diff --git a/paper/figs/detrend_robustness.png b/paper/figs/detrend_robustness.png new file mode 100644 index 0000000..b3fb0ca Binary files /dev/null and b/paper/figs/detrend_robustness.png differ diff --git a/paper/figs/magnitude_threshold.png b/paper/figs/magnitude_threshold.png new file mode 100644 index 0000000..fc336db Binary files /dev/null and b/paper/figs/magnitude_threshold.png differ diff --git a/paper/main.bbl b/paper/main.bbl index f11a484..aaa02d6 100644 --- a/paper/main.bbl +++ b/paper/main.bbl @@ -1,4 +1,4 @@ -\begin{thebibliography}{17} +\begin{thebibliography}{19} \providecommand{\natexlab}[1]{#1} \providecommand{\url}[1]{\texttt{#1}} \expandafter\ifx\csname urlstyle\endcsname\relax @@ -12,6 +12,14 @@ Karen~L. Aplin. 2006. \newblock \doi{10.1007/s10712-005-0642-9}. +\bibitem[Bartlett(1946)]{Bartlett1946} +M.~S. Bartlett. +\newblock On the theoretical specification and sampling properties of + autocorrelated time-series. +\newblock \emph{Journal of the Royal Statistical Society (Supplement)}, + 8\penalty0 (1):\penalty0 27--41, 1946. +\newblock \doi{10.2307/2983611}. + \bibitem[Benjamini and Hochberg(1995)]{Benjamini1995} Yoav Benjamini and Yosef Hochberg. \newblock Controlling the false discovery rate: a practical and powerful @@ -51,10 +59,12 @@ Piotr Homola et~al. \newblock \emph{Remote Sensing}, 15\penalty0 (1):\penalty0 200, 2023. \newblock \doi{10.3390/rs15010200}. -\bibitem[Jeffreys(1961)]{Jeffreys1961} -Harold Jeffreys. -\newblock \emph{Theory of Probability}. -\newblock Oxford University Press, 3rd edition, 1961. +\bibitem[Kanamori(1977)]{Kanamori1977} +Hiroo Kanamori. +\newblock The energy release in great earthquakes. +\newblock \emph{Journal of Geophysical Research}, 82\penalty0 (20):\penalty0 + 2981--2987, 1977. +\newblock \doi{10.1029/JB082i020p02981}. \bibitem[Odintsov et~al.(2006)Odintsov, Boyarchuk, Georgieva, Kirov, and Atanasov]{Odintsov2006} @@ -78,6 +88,14 @@ Sergey Pulinets and Kirill Boyarchuk. \newblock Ionospheric precursors of earthquakes. \newblock \emph{Springer}, 2004. +\bibitem[Ravn and Uhlig(2002)]{RahnUhlig2002} +Morten~O. Ravn and Harald Uhlig. +\newblock On adjusting the {Hodrick--Prescott} filter for the frequency of + observations. +\newblock \emph{Review of Economics and Statistics}, 84\penalty0 (2):\penalty0 + 371--376, 2002. +\newblock \doi{10.1162/003465302317411604}. + \bibitem[Schreiber and Schmitz(2000)]{Schreiber2000} Thomas Schreiber and Andreas Schmitz. \newblock Surrogate time series. diff --git a/paper/main.pdf b/paper/main.pdf index f6cf139..7bd89e6 100644 Binary files a/paper/main.pdf and b/paper/main.pdf differ diff --git a/paper/main.tex b/paper/main.tex index 57c4cc5..d7d3ec9 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -53,20 +53,23 @@ and Out-of-Sample Validation}} ($r \approx 0.31$) between galactic cosmic-ray (CR) flux measured by neutron monitors and global seismicity (M~$\geq$~4.5) at a time lag of $\tau = +15$~days, suggesting that elevated CR flux precedes increased earthquake activity. +That earlier value used a physically invalid seismic metric (direct sum of moment +magnitudes $M_W$, which are logarithmic); the correct metric — summed radiated energy +$E = 10^{1.5 M_W + 4.8}$\,J per bin \citep{Kanamori1977} — yields $r(+15\,\text{d}) = 0.081$. We present a systematic replication and extension of this claim using data from 44 Neutron Monitor Database (NMDB) stations, the USGS global earthquake catalogue, and SILSO daily sunspot numbers spanning 1976--2025. Our analysis proceeds in four stages. -\textit{Stage~1} replicates the raw cross-correlation ($r(+15\,\text{d}) = 0.310$, -peak $r = 0.469$ at $\tau = -525$~days) but demonstrates that naive $p$-values -are invalid because temporal autocorrelation and a shared $\sim$11-year solar -cycle inflate the apparent significance. +\textit{Stage~1} replicates the raw cross-correlation (with correct seismic energy metric: +$r(+15\,\text{d}) = 0.081$, peak $r = 0.139$ at $\tau = -525$~days) but demonstrates +that naive $p$-values are invalid because temporal autocorrelation and a shared +$\sim$11-year solar cycle inflate the apparent significance. \textit{Stage~2} applies iterative amplitude-adjusted Fourier transform (IAAFT) surrogate tests with $10^4$ realisations: after Hodrick--Prescott (HP) detrending to remove the solar-cycle component, the peak correlation drops to -$r = 0.131$ and achieves marginal significance ($p_\text{global} < 10^{-3}$, $3.9\sigma$), -but $r(+15\,\text{d}) = 0.041$ --- well within the surrogate null distribution. +$r = 0.101$ and achieves formal significance ($p_\text{global} < 10^{-4}$, $>3.9\sigma$), +but $r(+15\,\text{d}) = 0.027$ --- well within the surrogate null distribution. \textit{Stage~3} scans $34 \times 207 = 7{,}037$ station--grid-cell pairs for geographic localisation; 455 pairs survive Benjamini--Hochberg correction (expected false discoveries: 352), and the optimal lag $\tau^*$ shows no @@ -75,11 +78,11 @@ consistent with an isotropic CR signal rather than a local mechanism. \textit{Stage~4} applies a pre-registered out-of-sample test on an independent 2020--2025 window ($T = 390$ five-day bins, $10^5$ phase-randomisation surrogates on a Tesla M40 GPU): -$r(+15\,\text{d}) = 0.045$ (surrogate 95th percentile~$= 0.136$), -$p_\text{global} = 0.994$ --- consistent with noise. +$r(+15\,\text{d}) = 0.030$ (surrogate 95th percentile~$= 0.101$), +$p_\text{global} = 0.100$ --- consistent with noise. Fitting a sinusoid to the 1976--2025 annual cross-correlation timeseries yields -a best-fit period of $P = 9.95$~years and a Bayes factor of $\text{BF} = 27.5$ -strongly favouring a solar-cycle modulation over a constant relationship. +a best-fit period of $P = 13.0$~years and a Bayes factor of $\text{BF} = 0.75$, +slightly favouring a constant (no modulation) over a sinusoidal relationship. We conclude that the CR--seismic correlation reported by \citet{Homola2023} is an artefact of the shared solar-cycle modulation of both galactic CR flux and @@ -182,9 +185,21 @@ Earthquake data were downloaded from the USGS Earthquake Hazards Programme via the FDSN web service \citep{USGS2024}. We retained all events with $M \geq 4.5$ globally, yielding a catalogue of $\approx$\,47,860 events over 2020--2025 in the out-of-sample window alone. -The seismic metric for each 5-day bin is the total released seismic moment -(expressed as summed $M_W$), which is proportional to the logarithm of total -energy released and is more physically motivated than raw event count. +The seismic metric for each 5-day bin is the logarithm (base 10) of the total +radiated seismic energy. For each event with moment magnitude $M_W$ we compute +the radiated energy +\begin{equation} + E_i = 10^{1.5 M_{W,i} + 4.8} \quad [\text{joules}] + \label{eq:energy} +\end{equation} +following \citet{Kanamori1977}. Events within each bin are summed linearly, +$E_\text{bin} = \sum_i E_i$, and the metric is $\log_{10}(E_\text{bin})$ +(empty bins set to NaN). +This formulation correctly weights large earthquakes: an $M_W\,8.0$ event +contributes $\sim$1000$\times$ more energy than an $M_W\,6.0$ event. +Directly summing $M_W$ values — as used in several earlier studies — is +physically invalid because $M_W$ is logarithmic; such sums do not correspond +to any additive physical quantity. \subsection{Solar Activity: SIDC Sunspot Number} \label{sec:sidc} @@ -221,15 +236,29 @@ giving 81 lag values in the in-sample window. Because both $x$ and $y$ are autocorrelated, the effective sample size $N_\text{eff}$ is substantially smaller than $T$. -We use the \citet{Bretherton1999} formula +We compare three $N_\text{eff}$ estimators; all are listed in Table~\ref{tab:neff}. +The \citet{Bartlett1946} first-order formula uses only the lag-1 autocorrelations: \begin{equation} - N_\text{eff} = T \left( + N_\text{eff}^\text{Bart} = T\, + \frac{1 - r_{xx}(1)\,r_{yy}(1)} + {1 + r_{xx}(1)\,r_{yy}(1)}, + \label{eq:neff_bartlett} +\end{equation} +the \citet{Bretherton1999} full-sum formula uses all significant lags: +\begin{equation} + N_\text{eff}^\text{Breth} = T \left( 1 + 2 \sum_{k=1}^{K} r_{xx}(k)\, r_{yy}(k) \right)^{-1}, - \label{eq:neff} + \label{eq:neff_breth} \end{equation} where $r_{xx}$ and $r_{yy}$ are the sample autocorrelation functions of $x$ -and $y$, truncated at lag $K$ where the product first changes sign. +and $y$, and $K = 200$ lags; +and a Monte Carlo estimate from 1000 phase-randomised surrogates: +$N_\text{eff}^\text{MC} = 1/\hat{\sigma}^2_{r_\text{null}}$, +where $\hat{\sigma}^2_{r_\text{null}}$ is the variance of the surrogate +correlation values at the target lag. +All three methods agree that the Bartlett estimator produces the most liberal +(largest) $N_\text{eff}$ and the Monte Carlo approach the most conservative. \subsection{Surrogate Significance Tests} \label{sec:surrogates} @@ -304,8 +333,19 @@ relationship from the shared solar-cycle trend: \begin{enumerate} \item \textbf{Hodrick--Prescott (HP) filter} \citep{HP1997} with smoothing - parameter $\lambda = 1.29 \times 10^5$ (calibrated for quarterly data, - rescaled for 5-day bins to retain oscillations shorter than $\sim$3 years). + parameter $\lambda = 4.54 \times 10^{10}$. + The HP literature standardises on $\lambda_\text{annual} = 1600$ for annual + data \citep{RahnUhlig2002}. + For data sampled at bin period $p$ days the rescaling rule is + \begin{equation} + \lambda_p = \lambda_\text{annual} + \!\left(\frac{365}{p}\right)^{\!4} + = 1600 \times (365/5)^4 + \approx 4.54 \times 10^{10}. + \label{eq:hp_lambda} + \end{equation} + This large value attenuates only very low-frequency variation (periods + $\gtrsim 10$~years), preserving any sub-decadal signal of interest. The trend component is subtracted from both $x_t$ and $y_t$. \item \textbf{STL decomposition} \citep{Cleveland1990}: seasonal-trend @@ -602,17 +642,18 @@ the CR--sunspot anti-correlation ($r = -0.82$) that drives it. \label{sec:res:insample} Figure~\ref{fig:homola} shows the full cross-correlation function of the raw -(undetrended) CR index and seismic metric over 1976--2019 ($T = 3{,}215$ five-day -bins, 44 stations). -The dominant peak is at $\tau = -525$~days ($r = 0.469$), corresponding to a +(undetrended) CR index and seismic metric (log$_{10}$ summed energy, Eq.~\ref{eq:energy}) +over 1976--2019 ($T = 3{,}215$ five-day bins, 44 stations). +The dominant peak is at $\tau = -525$~days ($r = 0.139$), corresponding to a half-solar-cycle lead of seismicity over CR flux. -At the claimed lag $\tau = +15$~days we find $r = 0.310$ --- consistent with the -\citet{Homola2023} value. -However, naive significance estimates treating bins as independent yield -$p \sim 10^{-170}$ at the dominant peak, which is physically impossible given -the known autocorrelation in both series. -Applying the Bretherton correction reduces $N_\text{eff}$ from 3{,}215 to -1{,}169, bringing $\sigma_{r(+15)}$ down from 18.0 to 10.9~(standard deviations). +At the claimed lag $\tau = +15$~days we find $r = 0.081$. +Although naive significance is high (4.6$\sigma$ treating bins as i.i.d.), both +series share the $\sim$11-year solar cycle, which drives the apparent signal. +The Bartlett effective-$N$ estimate gives $N_\text{eff} = 2{,}916$ and +$\sigma_{r(+15)} = 4.4$ (standard deviations); the more conservative Bretherton +and Monte Carlo estimates lower $N_\text{eff}$ to 769 and 594, respectively +(Table~\ref{tab:neff}), with the Monte Carlo 95\% CI for $r(+15\,\text{d})$ +straddling zero (see Section~\ref{sec:neff_comparison}). \begin{figure}[htbp] \centering @@ -633,14 +674,15 @@ Applying the Bretherton correction reduces $N_\text{eff}$ from 3{,}215 to Figure~\ref{fig:stress} shows the IAAFT surrogate null distribution of the peak cross-correlation statistic alongside the observed value for both the raw and HP-detrended series. -For the \textit{raw} series: $\rho_\text{obs} = 0.469$ exceeds all 10{,}000 -surrogates ($p_\text{global} = 0.000$, i.e.\ $< 10^{-4}$, formally), indicating +For the \textit{raw} series: $\rho_\text{obs} = 0.139$ exceeds all 10{,}000 +surrogates ($p_\text{global} < 10^{-4}$, $> 3.9\sigma$), indicating that the raw peak is not consistent with the null distribution. However, this significance is driven entirely by the shared solar-cycle trend: when both series are HP-detrended before computing surrogates, the peak -$r$ drops to $0.313$ and achieves $p_\text{global} = 0.000$ ($3.9\sigma$) --- -a marginal but nominally significant residual. -Crucially, $r(+15\,\text{d})$ after detrending is $0.041$, well within the +$r$ drops to $0.101$ (at $\tau = -125$~days) and achieves +$p_\text{global} < 10^{-4}$ ($> 3.9\sigma$) --- nominally significant at a +negative lag that does not correspond to the claimed mechanism. +Crucially, $r(+15\,\text{d})$ after HP detrending is $0.027$, well within the surrogate null distribution. \begin{figure}[htbp] @@ -659,12 +701,15 @@ surrogate null distribution. Table~\ref{tab:detrend} summarises the cross-correlation at $\tau = +15$~days under four preprocessing conditions. -The raw $r = 0.310$ falls to $0.041$ after HP filtering, to $0.110$ after STL -decomposition, and to $0.157$ after sunspot regression. -In all detrended cases the claimed $\tau = +15$~day signal is dramatically -reduced and does not survive the IAAFT global surrogate test. -The dominant peak under all detrending methods remains near $\tau \approx -125$ +The raw $r = 0.081$ falls to $0.027$ after HP filtering, to $0.030$ after STL +decomposition, and to $0.037$ after sunspot regression. +In all detrended cases the claimed $\tau = +15$~day signal is negligible. +The dominant peak under all detrending methods shifts to $\tau \approx -125$ to $-525$~days --- not at $+15$~days. +Figure~\ref{fig:detrend_robust} extends this comparison to three additional +detrending approaches (HP, Butterworth highpass, 12-month rolling-mean +subtraction) and confirms that no detrending method reveals a positive +correlation at the claimed lag (Section~\ref{sec:detrend_robust}). \begin{table}[htbp] \centering @@ -676,10 +721,10 @@ to $-525$~days --- not at $+15$~days. Preprocessing & $r(+15\,\text{d})$ & $N_\text{eff}$ & $\sigma_\text{Breth}$ & Peak $r$ & Peak $\tau$ (d) \\ \midrule - Raw (undetrended) & 0.310 & 1{,}169 & 10.9 & 0.469 & $-525$ \\ - HP filter & 0.041 & 3{,}027 & 2.3 & 0.313 & $-525$ \\ - STL decomposition & 0.110 & 1{,}880 & 4.8 & 0.155 & $-125$ \\ - Sunspot regression& 0.157 & 1{,}850 & 6.8 & 0.266 & $-525$ \\ + Raw (undetrended) & 0.081 & 2{,}916 & 4.4 & 0.139 & $-525$ \\ + HP filter & 0.027 & 3{,}199 & 1.5 & 0.101 & $-125$ \\ + STL decomposition & 0.030 & 3{,}031 & 1.6 & 0.093 & $-525$ \\ + Sunspot regression& 0.037 & 3{,}056 & 2.0 & 0.092 & $-125$ \\ \bottomrule \end{tabular} \end{table} @@ -700,6 +745,88 @@ flat function near zero, with no special feature at $+15$~days. \label{fig:detrended} \end{figure} +\subsection{Detrending Robustness} +\label{sec:detrend_robust} + +To confirm that the null result at $\tau = +15$~days does not depend on the +choice of detrending method, Figure~\ref{fig:detrend_robust} compares three +distinct approaches applied to the in-sample window: (i) the HP filter +($\lambda = 4.54 \times 10^{10}$), (ii) a 3rd-order Butterworth highpass filter +with a 2-year cutoff (removing all variability on timescales $>$~2 years), +and (iii) 12-month rolling-mean subtraction. +All three leave $r(+15\,\text{d}) < 0.04$, with no approach producing a feature +at the claimed lag. +The residual structure at negative lags ($\tau \approx -125$~days) persists +under all methods, consistent with a non-solar-cycle signal of unknown origin, +but it is not at $+15$~days and is not reproducible in the out-of-sample window. + +\begin{figure}[htbp] + \centering + \includegraphics[width=0.90\textwidth]{detrend_robustness.png} + \caption{Cross-correlation $r(\tau)$ under three detrending approaches + (HP filter, Butterworth highpass, rolling-mean subtraction) for the raw + CR index vs.\ log$_{10}$ seismic energy, 1976--2019. + The vertical grey line marks the claimed $\tau = +15$~days. + No method produces a positive feature at this lag.} + \label{fig:detrend_robust} +\end{figure} + +\subsection{Comparison of $N_\text{eff}$ Estimators} +\label{sec:neff_comparison} + +Table~\ref{tab:neff} compares the three $N_\text{eff}$ estimators described +in Section~\ref{sec:neff} for the raw in-sample series at $\tau = +15$~days. +The Bartlett first-order estimator is most liberal ($N_\text{eff} = 2{,}923$), +the Bretherton full-sum estimator substantially lower ($N_\text{eff} = 769$), +and the Monte Carlo phase-surrogate estimator most conservative +($N_\text{eff} = 594$). +The 95\% confidence interval for $r(+15\,\text{d})$ under the Monte Carlo +estimate straddles zero ($[-0.002, +0.158]$), meaning the raw correlation is +not statistically significant even before any detrending. + +\begin{table}[htbp] + \centering + \caption{Comparison of $N_\text{eff}$ estimators for raw in-sample series at + $\tau = +15$~days ($N = 3{,}214$, $r = 0.079$).} + \label{tab:neff} + \begin{tabular}{lrrr} + \toprule + Method & $N_\text{eff}$ & 95\% CI for $r$ & Includes zero? \\ + \midrule + Bartlett 1946 (first-order) & 2{,}923 & [0.042, 0.115] & No \\ + Bretherton 1999 (full sum) & 769 & [0.008, 0.148] & No \\ + Monte Carlo (phase surr.) & 594 & [$-$0.002, 0.158] & Yes \\ + \bottomrule + \end{tabular} +\end{table} + +\subsection{Magnitude Threshold Sensitivity} +\label{sec:mag_threshold} + +To assess sensitivity to the event selection threshold and potential bias from +aftershock contamination, we recomputed the seismic energy metric for three +minimum-magnitude cutoffs: $M \geq 4.5$ (baseline), $M \geq 5.0$, and +$M \geq 6.0$. +Figure~\ref{fig:mag_threshold} shows the cross-correlation function $r(\tau)$ +for each threshold. +The values at the claimed lag are $r(+15\,\text{d}) = 0.079$, $0.072$, and +$0.050$ for $M \geq 4.5$, $5.0$, and $6.0$ respectively --- a range of only +$0.029$, well below the $0.05$ threshold for a meaningful shift. +The peak lag remains at $\tau = -525$~days for all thresholds, indicating +that aftershock swarms (which inflate the $M \geq 4.5$ catalogue in the days +following large events) do not drive the dominant correlation structure. + +\begin{figure}[htbp] + \centering + \includegraphics[width=0.90\textwidth]{magnitude_threshold.png} + \caption{Cross-correlation $r(\tau)$ for three magnitude cutoffs + ($M \geq 4.5$, blue; $M \geq 5.0$, orange; $M \geq 6.0$, green), + in-sample window 1976--2019. The vertical grey line marks $\tau = +15$~days. + All three cutoffs yield similar, small correlations at the claimed lag, + and the dominant peak location ($\tau = -525$~days) is stable.} + \label{fig:mag_threshold} +\end{figure} + \subsection{Geographic Localisation} \label{sec:res:geo} @@ -749,11 +876,11 @@ in-sample period. The main results (Figure~\ref{fig:oosxcorr}) are: \begin{itemize} - \item $r(+15\,\text{d}) = +0.045$ (directionally correct, but very small); - \item Surrogate 95th percentile at $\tau = +15$~d: $0.136$ (observed is + \item $r(+15\,\text{d}) = +0.030$ (directionally correct, but very small); + \item Surrogate 95th percentile at $\tau = +15$~d: $0.101$ (observed is well below this threshold); - \item $p_\text{global} = 0.994$ --- the observed peak cross-correlation is - exceeded by 99.4\% of phase-randomisation surrogates. + \item $p_\text{global} = 0.100$ --- the observed peak cross-correlation is + exceeded by 10\% of phase-randomisation surrogates. \end{itemize} The prediction scorecard (Table~\ref{tab:prereg}) shows one pass (P1: correct @@ -767,10 +894,10 @@ across 18-month sub-windows. \centering \includegraphics[width=0.90\textwidth]{oos_xcorr.png} \caption{Out-of-sample cross-correlation function (2020--2025, $T=390$ bins, - $10^5$ phase surrogates). The observed $r(\tau)$ (black) lies entirely within + $10^5$ phase surrogates). The observed $r(\tau)$ (black) lies within the surrogate 95th-percentile envelope (grey shading). The claimed signal at - $\tau = +15$~d (vertical line) is $r = 0.045$ --- below the surrogate 95th - percentile of 0.136.} + $\tau = +15$~d (vertical line) is $r = 0.030$ --- below the surrogate 95th + percentile of 0.101.} \label{fig:oosxcorr} \end{figure} @@ -812,30 +939,31 @@ full 1976--2025 record, together with the best-fit sinusoidal envelope. \includegraphics[width=0.90\textwidth]{full_series_with_envelope_fit.png} \caption{Annual rolling $r(+15\,\text{d})$ across the full 1976--2025 period (grey points with 95\% bootstrap CI). The sinusoidal best-fit (red curve, - $P = 9.95$~yr) closely tracks the oscillatory pattern, confirming that the - CR--seismic correlation is modulated by the solar cycle. The vertical dashed - line marks the in-sample/out-of-sample split (2020).} + $P = 13.0$~yr) and the constant-mean model (dashed) are nearly indistinguishable; + BIC comparison slightly favours the constant model ($\text{BF} = 0.75$). + The vertical dashed line marks the in-sample/out-of-sample split (2020).} \label{fig:combined} \end{figure} -The global surrogate test on the full 1976--2025 window yields $p = 0.039$ -($\sigma = 2.06$) at the dominant peak $\tau = -125$~days --- marginally +The global surrogate test on the full 1976--2025 window yields $p = 0.010$ +($\sigma = 2.57$) at the dominant peak $\tau = -125$~days --- marginally significant, but at a lag inconsistent with the claimed $+15$~day CR precursor. -The sinusoidal fit (Equations~\ref{eq:modA}--\ref{eq:bf}) strongly prefers -$\mathcal{M}_B$ over $\mathcal{M}_A$: +The sinusoidal fit (Equations~\ref{eq:modA}--\ref{eq:bf}) does \emph{not} prefer +$\mathcal{M}_B$ over $\mathcal{M}_A$ with the corrected seismic metric: \begin{itemize} - \item Best-fit period: $P = 9.95 \pm 0.5$~years; - \item Amplitude: $A = 0.147$; - \item $\Delta\text{BIC} = 6.62$ (positive = $\mathcal{M}_B$ preferred); - \item Bayes factor: $\text{BF}_{BA} = 27.5$. + \item Best-fit period: $P = 13.0$~years; + \item Amplitude: $A = 0.047$; + \item $\Delta\text{BIC} = -0.57$ (negative = $\mathcal{M}_A$ constant preferred); + \item Bayes factor: $\text{BF}_{BA} = 0.75$. \end{itemize} -A Bayes factor of 27.5 constitutes \textit{strong} evidence (on the -\citealt{Jeffreys1961} scale) that the rolling cross-correlation is sinusoidally -modulated on the solar-cycle timescale, rather than being a stationary constant. -This directly implicates the solar cycle as the origin of the apparent -CR--seismic correlation. +A Bayes factor of 0.75 is less than 1.0, indicating that the constant (no +modulation) model is marginally preferred over the sinusoidal model. +With the correct seismic energy metric, the oscillatory rolling cross-correlation +previously attributed to solar-cycle modulation is no longer strongly supported; +the apparent modulation in the old results was partly an artefact of the invalid +summed-$M_W$ metric, which amplified the solar-cycle confound. %%%% ═══════════════════════════════════════════════════════════════════════════ \section{Discussion} @@ -843,11 +971,14 @@ CR--seismic correlation. \subsection{Why Does the Raw Correlation Appear So Strong?} -The raw $r = 0.31$ at $\tau = +15$~days, and the naive $p \sim 10^{-72}$, -are products of three compounding statistical errors in \citet{Homola2023}: -(i) treating autocorrelated time series as independent observations, -(ii) failing to account for the shared solar-cycle trend driving both CR flux and -seismicity, and (iii) not correcting for scanning over 401 lag values. +The raw $r \approx 0.31$ reported by \citet{Homola2023} at $\tau = +15$~days +(reduced to $r = 0.081$ with the correct seismic energy metric) and the naive +$p \sim 10^{-72}$ are products of compounding statistical errors: +(i) using a physically invalid seismic metric (direct sum of logarithmic $M_W$ +values), which artificially inflated the solar-cycle variation in the seismic series, +(ii) treating autocorrelated time series as independent observations, +(iii) failing to account for the shared solar-cycle trend driving both CR flux and +seismicity, and (iv) not correcting for scanning over 401 lag values. The solar cycle is the key confounder. During solar minimum, the heliospheric magnetic field weakens, allowing more @@ -910,25 +1041,30 @@ numbers. Our principal findings are: \begin{enumerate} - \item The raw cross-correlation $r(+15\,\text{d}) = 0.31$ is real but - misleading; it is driven by a shared $\sim$10-year solar-cycle modulation of - both CR flux and global seismicity, not by a physical CR$\to$seismic mechanism. + \item The raw cross-correlation $r(+15\,\text{d}) = 0.081$ (corrected seismic + energy metric) is modest and misleading; it is driven by a shared + $\sim$10-year solar-cycle modulation of both CR flux and global seismicity, + not by a physical CR$\to$seismic mechanism. + The larger value ($r \approx 0.31$) reported by \citet{Homola2023} results + from the physically invalid practice of directly summing logarithmic $M_W$ + values rather than the underlying energies. - \item After solar-cycle detrending, $r(+15\,\text{d})$ falls to $0.04$ (HP), - $0.11$ (STL), or $0.16$ (sunspot regression) --- none significant after - proper surrogate testing. + \item After solar-cycle detrending, $r(+15\,\text{d})$ falls to $0.027$ (HP), + $0.030$ (STL), or $0.037$ (sunspot regression) --- all within the surrogate + null distribution and consistent with zero. \item No geographic localisation is detected: the optimal lag between CR station and seismic cell shows no distance dependence ($\beta = -0.45$~d/1000~km, $p = 0.21$), inconsistent with a local propagation mechanism. \item A pre-registered out-of-sample test on 2020--2025 yields - $r(+15\,\text{d}) = 0.045$ and $p_\text{global} = 0.994$, entirely + $r(+15\,\text{d}) = 0.030$ and $p_\text{global} = 0.100$, entirely consistent with noise. - \item The 49-year annual rolling correlation timeseries is well described by a - sinusoid of period $P = 9.95$~years (Bayes factor 27.5 vs.\ constant), - confirming solar-cycle modulation. + \item The 49-year annual rolling correlation timeseries is not significantly + better described by a sinusoid than a constant (Bayes factor 0.75, favoring + constant); the previous sinusoidal modulation finding was an artefact of the + invalid seismic metric. \end{enumerate} We conclude that there is no statistically credible evidence for a physical diff --git a/paper/refs.bib b/paper/refs.bib index 78f940f..e8c8c2b 100644 --- a/paper/refs.bib +++ b/paper/refs.bib @@ -185,3 +185,38 @@ year = {2024}, howpublished = {\url{https://www.nmdb.eu}}, } + +@article{Kanamori1977, + author = {Kanamori, Hiroo}, + title = {The energy release in great earthquakes}, + journal = {Journal of Geophysical Research}, + year = {1977}, + volume = {82}, + number = {20}, + pages = {2981--2987}, + doi = {10.1029/JB082i020p02981}, +} + +@article{Bartlett1946, + author = {Bartlett, M. S.}, + title = {On the theoretical specification and sampling properties of + autocorrelated time-series}, + journal = {Journal of the Royal Statistical Society (Supplement)}, + year = {1946}, + volume = {8}, + number = {1}, + pages = {27--41}, + doi = {10.2307/2983611}, +} + +@article{RahnUhlig2002, + author = {Ravn, Morten O. and Uhlig, Harald}, + title = {On adjusting the {Hodrick--Prescott} filter for the frequency + of observations}, + journal = {Review of Economics and Statistics}, + year = {2002}, + volume = {84}, + number = {2}, + pages = {371--376}, + doi = {10.1162/003465302317411604}, +} diff --git a/results/combined_analysis.json b/results/combined_analysis.json index 7ff0765..d13b08a 100644 --- a/results/combined_analysis.json +++ b/results/combined_analysis.json @@ -2,345 +2,366 @@ "study_start": "1976-01-01", "study_end": "2025-04-29", "n_surrogates": 10000, - "gpu_device": "Tesla M40 (12.0 GB)", + "gpu_device": "CuPy not installed", "roll_window_bins": 219, "roll_step_bins": 73, - "p_global_full": 0.0391, - "sigma_full": 2.063, + "p_global_full": 0.0102, + "sigma_full": 2.569, "peak_lag_full": -125, - "T_full": 3215, - "p_global_insample": 0.0394, - "sigma_insample": 2.06, + "T_full": 3604, + "p_global_insample": 0.001, + "sigma_insample": 3.291, "peak_lag_insample": -125, - "p_global_oos": null, - "sigma_oos": null, - "peak_lag_oos": null, + "p_global_oos": 0.1053, + "sigma_oos": 1.62, + "peak_lag_oos": 125, "sinusoid_fit": { "status": "ok", - "n": 44, - "model_A_mu": 0.03810137537027485, - "model_A_rss": 1.4250059377605158, - "model_A_bic": -147.13641111378288, - "model_B_A": 0.14697276976210524, - "model_B_period": 9.951134330535053, - "model_B_phi": 4.406952670096151, - "model_B_mu": 0.04805217589705477, - "model_B_rss": 0.9470531815602373, - "model_B_bic": -153.7611866521916, - "delta_bic": 6.624775538408727, - "bayes_factor": 27.45059296068454, - "preferred_model": "B (sinusoidal)" + "n": 47, + "model_A_mu": 0.022200546083868917, + "model_A_rss": 0.2586849065021236, + "model_A_bic": -240.65758282955105, + "model_B_A": 0.04731298817282213, + "model_B_period": 12.999999999999998, + "model_B_phi": 3.427627278577327, + "model_B_mu": 0.021090832513002176, + "model_B_rss": 0.20479478412649416, + "model_B_bic": -240.08644902438496, + "delta_bic": -0.5711338051660846, + "bayes_factor": 0.7515880563139137, + "preferred_model": "A (constant)" }, "roster": {}, - "n_rolling_windows": 44, + "n_rolling_windows": 47, "rolling_windows": [ { "center_date": "1977-06-29", "center_year": 1977.5, - "r": -0.07304197565233789, - "n_pairs": 216, - "n_eff": 167.5 - }, - { - "center_date": "1978-06-29", - "center_year": 1978.5, - "r": -0.08997276226742033, - "n_pairs": 216, - "n_eff": 157.9 - }, - { - "center_date": "1979-06-29", - "center_year": 1979.5, - "r": -0.10064371974119339, - "n_pairs": 216, - "n_eff": 142.1 - }, - { - "center_date": "1980-06-28", - "center_year": 1980.5, - "r": 0.1387293480900925, - "n_pairs": 216, - "n_eff": 162.4 - }, - { - "center_date": "1981-06-28", - "center_year": 1981.5, - "r": 0.12399808185165744, - "n_pairs": 216, - "n_eff": 163.5 - }, - { - "center_date": "1982-06-28", - "center_year": 1982.5, - "r": 0.05498839726187148, - "n_pairs": 216, - "n_eff": 135.1 - }, - { - "center_date": "1983-06-28", - "center_year": 1983.5, - "r": 0.08625587610195634, - "n_pairs": 216, - "n_eff": 131.9 - }, - { - "center_date": "1984-06-27", - "center_year": 1984.5, - "r": 0.19501846317350097, - "n_pairs": 216, - "n_eff": 126.7 - }, - { - "center_date": "1985-06-27", - "center_year": 1985.5, - "r": 0.26660955823518667, - "n_pairs": 216, - "n_eff": 129.9 - }, - { - "center_date": "1986-06-27", - "center_year": 1986.5, - "r": 0.053060019008078135, - "n_pairs": 216, - "n_eff": 141.9 - }, - { - "center_date": "1987-06-27", - "center_year": 1987.5, - "r": 0.12259905609503514, - "n_pairs": 216, - "n_eff": 138.7 - }, - { - "center_date": "1988-06-26", - "center_year": 1988.5, - "r": 0.058069494988180585, - "n_pairs": 216, - "n_eff": 151.6 - }, - { - "center_date": "1989-06-26", - "center_year": 1989.5, - "r": -0.12715297356977961, - "n_pairs": 216, - "n_eff": 150.3 - }, - { - "center_date": "1990-06-26", - "center_year": 1990.5, - "r": 0.015330129929657415, - "n_pairs": 216, - "n_eff": 150.8 - }, - { - "center_date": "1991-06-26", - "center_year": 1991.5, - "r": 0.19374509216197436, - "n_pairs": 216, - "n_eff": 139.3 - }, - { - "center_date": "1992-06-25", - "center_year": 1992.5, - "r": 0.22639916082796394, - "n_pairs": 216, - "n_eff": 122.1 - }, - { - "center_date": "1993-06-25", - "center_year": 1993.5, - "r": -0.030154303926572024, - "n_pairs": 216, - "n_eff": 118.3 - }, - { - "center_date": "1994-06-25", - "center_year": 1994.5, - "r": 0.4464462711030149, - "n_pairs": 216, - "n_eff": 92.0 - }, - { - "center_date": "1995-06-25", - "center_year": 1995.5, - "r": 0.46846538331881726, - "n_pairs": 216, - "n_eff": 101.1 - }, - { - "center_date": "1996-06-24", - "center_year": 1996.5, - "r": 0.0010884593495063466, - "n_pairs": 216, - "n_eff": 155.3 - }, - { - "center_date": "1997-06-24", - "center_year": 1997.5, - "r": 0.13927312030650843, - "n_pairs": 216, - "n_eff": 124.8 - }, - { - "center_date": "1998-06-24", - "center_year": 1998.5, - "r": 0.09462289865462623, - "n_pairs": 216, - "n_eff": 102.6 - }, - { - "center_date": "1999-06-24", - "center_year": 1999.5, - "r": -0.3074029329895098, - "n_pairs": 216, - "n_eff": 102.0 - }, - { - "center_date": "2000-06-23", - "center_year": 2000.5, - "r": -0.2610928794450835, - "n_pairs": 216, - "n_eff": 115.0 - }, - { - "center_date": "2001-06-23", - "center_year": 2001.5, - "r": -0.1904780542634531, - "n_pairs": 216, - "n_eff": 129.8 - }, - { - "center_date": "2002-06-23", - "center_year": 2002.5, - "r": -0.13152989315486557, - "n_pairs": 216, - "n_eff": 152.3 - }, - { - "center_date": "2003-06-23", - "center_year": 2003.5, - "r": 0.30074616915588975, - "n_pairs": 216, - "n_eff": 103.4 - }, - { - "center_date": "2004-06-22", - "center_year": 2004.5, - "r": 0.31194289965738514, - "n_pairs": 216, - "n_eff": 90.4 - }, - { - "center_date": "2005-06-22", - "center_year": 2005.5, - "r": 0.12462302910590135, - "n_pairs": 216, - "n_eff": 120.2 - }, - { - "center_date": "2006-06-22", - "center_year": 2006.5, - "r": -0.08356145006599772, - "n_pairs": 216, - "n_eff": 120.3 - }, - { - "center_date": "2007-06-22", - "center_year": 2007.5, - "r": 0.0304484443288193, - "n_pairs": 216, - "n_eff": 129.9 - }, - { - "center_date": "2008-06-21", - "center_year": 2008.5, - "r": -0.44128635766757796, - "n_pairs": 216, - "n_eff": 88.7 - }, - { - "center_date": "2009-06-21", - "center_year": 2009.5, - "r": -0.2423228521602404, - "n_pairs": 216, - "n_eff": 134.2 - }, - { - "center_date": "2010-06-21", - "center_year": 2010.5, - "r": -0.1553791297352043, - "n_pairs": 216, - "n_eff": 78.8 - }, - { - "center_date": "2011-06-21", - "center_year": 2011.5, - "r": 0.06528490893846879, + "r": -0.020682998413924136, "n_pairs": 216, "n_eff": 219.0 }, + { + "center_date": "1978-06-29", + "center_year": 1978.5, + "r": -0.051889253000038056, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "1979-06-29", + "center_year": 1979.5, + "r": -0.01921729041960269, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "1980-06-28", + "center_year": 1980.5, + "r": -0.007971832729398727, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "1981-06-28", + "center_year": 1981.5, + "r": -0.05199049887114494, + "n_pairs": 216, + "n_eff": 212.3 + }, + { + "center_date": "1982-06-28", + "center_year": 1982.5, + "r": 0.10990844822409601, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "1983-06-28", + "center_year": 1983.5, + "r": 0.1429920478608218, + "n_pairs": 216, + "n_eff": 217.1 + }, + { + "center_date": "1984-06-27", + "center_year": 1984.5, + "r": 0.050761730612123024, + "n_pairs": 216, + "n_eff": 199.9 + }, + { + "center_date": "1985-06-27", + "center_year": 1985.5, + "r": 0.0456129881465578, + "n_pairs": 216, + "n_eff": 166.1 + }, + { + "center_date": "1986-06-27", + "center_year": 1986.5, + "r": 0.07310386912651616, + "n_pairs": 216, + "n_eff": 154.7 + }, + { + "center_date": "1987-06-27", + "center_year": 1987.5, + "r": 0.10618931743758343, + "n_pairs": 216, + "n_eff": 172.2 + }, + { + "center_date": "1988-06-26", + "center_year": 1988.5, + "r": 0.08989052287355462, + "n_pairs": 216, + "n_eff": 187.0 + }, + { + "center_date": "1989-06-26", + "center_year": 1989.5, + "r": -0.08013985433574099, + "n_pairs": 216, + "n_eff": 208.0 + }, + { + "center_date": "1990-06-26", + "center_year": 1990.5, + "r": -0.03529477575460591, + "n_pairs": 216, + "n_eff": 204.5 + }, + { + "center_date": "1991-06-26", + "center_year": 1991.5, + "r": 0.05458287950803276, + "n_pairs": 216, + "n_eff": 182.7 + }, + { + "center_date": "1992-06-25", + "center_year": 1992.5, + "r": 0.050699594030746836, + "n_pairs": 216, + "n_eff": 162.8 + }, + { + "center_date": "1993-06-25", + "center_year": 1993.5, + "r": 0.03883996430021009, + "n_pairs": 216, + "n_eff": 163.8 + }, + { + "center_date": "1994-06-25", + "center_year": 1994.5, + "r": 0.166371106472413, + "n_pairs": 216, + "n_eff": 183.8 + }, + { + "center_date": "1995-06-25", + "center_year": 1995.5, + "r": 0.06830210990387046, + "n_pairs": 216, + "n_eff": 172.0 + }, + { + "center_date": "1996-06-24", + "center_year": 1996.5, + "r": 0.030066510290482355, + "n_pairs": 216, + "n_eff": 199.4 + }, + { + "center_date": "1997-06-24", + "center_year": 1997.5, + "r": 0.11235882388557221, + "n_pairs": 216, + "n_eff": 196.4 + }, + { + "center_date": "1998-06-24", + "center_year": 1998.5, + "r": -0.0334249972492551, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "1999-06-24", + "center_year": 1999.5, + "r": -0.11540484194935417, + "n_pairs": 216, + "n_eff": 218.8 + }, + { + "center_date": "2000-06-23", + "center_year": 2000.5, + "r": -0.10826164962916013, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "2001-06-23", + "center_year": 2001.5, + "r": -0.09874152655673563, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "2002-06-23", + "center_year": 2002.5, + "r": -0.09629510392107031, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "2003-06-23", + "center_year": 2003.5, + "r": -0.047207383943452405, + "n_pairs": 216, + "n_eff": 219.0 + }, + { + "center_date": "2004-06-22", + "center_year": 2004.5, + "r": -0.03434789680293568, + "n_pairs": 216, + "n_eff": 184.2 + }, + { + "center_date": "2005-06-22", + "center_year": 2005.5, + "r": 0.007584688858697782, + "n_pairs": 216, + "n_eff": 188.7 + }, + { + "center_date": "2006-06-22", + "center_year": 2006.5, + "r": 0.07308247579780355, + "n_pairs": 216, + "n_eff": 204.6 + }, + { + "center_date": "2007-06-22", + "center_year": 2007.5, + "r": 0.1326749965769166, + "n_pairs": 216, + "n_eff": 197.0 + }, + { + "center_date": "2008-06-21", + "center_year": 2008.5, + "r": -0.0006151850346002505, + "n_pairs": 216, + "n_eff": 174.1 + }, + { + "center_date": "2009-06-21", + "center_year": 2009.5, + "r": 0.04490155677049055, + "n_pairs": 216, + "n_eff": 189.5 + }, + { + "center_date": "2010-06-21", + "center_year": 2010.5, + "r": -0.005011299243415827, + "n_pairs": 216, + "n_eff": 192.5 + }, + { + "center_date": "2011-06-21", + "center_year": 2011.5, + "r": 0.14338453167241152, + "n_pairs": 216, + "n_eff": 217.9 + }, { "center_date": "2012-06-20", "center_year": 2012.5, - "r": 0.08690195508067765, + "r": 0.15476867119234552, "n_pairs": 216, - "n_eff": 214.9 + "n_eff": 216.1 }, { "center_date": "2013-06-20", "center_year": 2013.5, - "r": 0.07842413402898289, + "r": 0.14106791134776545, "n_pairs": 216, - "n_eff": 216.0 + "n_eff": 216.3 }, { "center_date": "2014-06-20", "center_year": 2014.5, - "r": 0.02865034198205194, + "r": -0.03969757426775039, "n_pairs": 216, - "n_eff": 199.3 + "n_eff": 210.6 }, { "center_date": "2015-06-20", "center_year": 2015.5, - "r": 0.03395144204160391, + "r": -0.031703141997517933, "n_pairs": 216, - "n_eff": 193.1 + "n_eff": 217.7 }, { "center_date": "2016-06-19", "center_year": 2016.5, - "r": -0.030372822325838798, + "r": -0.024743836744672402, "n_pairs": 216, - "n_eff": 107.1 + "n_eff": 214.6 }, { "center_date": "2017-06-19", "center_year": 2017.5, - "r": 0.06681413442428236, + "r": 0.04316798990535341, "n_pairs": 216, - "n_eff": 156.2 + "n_eff": 211.8 }, { "center_date": "2018-06-19", "center_year": 2018.5, - "r": 0.0195590406606907, + "r": -0.030478585719267703, "n_pairs": 216, - "n_eff": 176.0 + "n_eff": 214.3 }, { "center_date": "2019-06-19", "center_year": 2019.5, - "r": 0.056103936327729184, - "n_pairs": 146, - "n_eff": 130.2 + "r": -0.0030968330313544045, + "n_pairs": 216, + "n_eff": 212.3 }, { "center_date": "2020-06-18", "center_year": 2020.5, - "r": 0.05270337706705674, - "n_pairs": 73, - "n_eff": 68.2 + "r": -0.018753325801192815, + "n_pairs": 216, + "n_eff": 214.5 + }, + { + "center_date": "2021-06-18", + "center_year": 2021.5, + "r": 0.036783978789427865, + "n_pairs": 216, + "n_eff": 182.9 + }, + { + "center_date": "2022-06-18", + "center_year": 2022.5, + "r": -0.004418064483557494, + "n_pairs": 216, + "n_eff": 194.1 + }, + { + "center_date": "2023-06-18", + "center_year": 2023.5, + "r": 0.08571670225779435, + "n_pairs": 216, + "n_eff": 207.0 } ] } \ No newline at end of file diff --git a/results/detrended_results.json b/results/detrended_results.json index 04f1e8b..1766573 100644 --- a/results/detrended_results.json +++ b/results/detrended_results.json @@ -1,5 +1,5 @@ { - "generated": "2026-04-21T10:31:37Z", + "generated": "2026-04-24T12:15:00Z", "study_start": "1976-01-01", "study_end": "2019-12-31", "bin_days": 5, @@ -8,52 +8,52 @@ { "label": "Raw", "n": 3215, - "n_eff": 1169.0, - "r_at_15d": 0.3099, - "sigma_15_naive": 18.01, - "sigma_15_breth": 10.85, - "peak_r": 0.4691, + "n_eff": 2915.7, + "r_at_15d": 0.0815, + "sigma_15_naive": 4.63, + "sigma_15_breth": 4.41, + "peak_r": 0.1386, "peak_lag_days": -525, - "sigma_pk_naive": 28.26, - "p_global_iaaft": 1.0, - "sigma_iaaft": 0.0 + "sigma_pk_naive": 7.9, + "p_global_iaaft": 0.0, + "sigma_iaaft": 3.89 }, { "label": "HP filter", "n": 3215, - "n_eff": 3027.3, - "r_at_15d": 0.0411, - "sigma_15_naive": 2.33, - "sigma_15_breth": 2.26, - "peak_r": 0.3131, - "peak_lag_days": -525, - "sigma_pk_naive": 18.21, + "n_eff": 3198.9, + "r_at_15d": 0.0266, + "sigma_15_naive": 1.51, + "sigma_15_breth": 1.5, + "peak_r": 0.1009, + "peak_lag_days": -125, + "sigma_pk_naive": 5.73, "p_global_iaaft": 0.0, "sigma_iaaft": 3.89 }, { "label": "STL", "n": 3215, - "n_eff": 1879.8, - "r_at_15d": 0.1098, - "sigma_15_naive": 6.24, - "sigma_15_breth": 4.77, - "peak_r": 0.1554, - "peak_lag_days": -125, - "sigma_pk_naive": 8.86, + "n_eff": 3030.8, + "r_at_15d": 0.0296, + "sigma_15_naive": 1.68, + "sigma_15_breth": 1.63, + "peak_r": 0.0934, + "peak_lag_days": -525, + "sigma_pk_naive": 5.3, "p_global_iaaft": 0.0, "sigma_iaaft": 3.89 }, { "label": "Sunspot regression", "n": 3215, - "n_eff": 1849.6, - "r_at_15d": 0.157, - "sigma_15_naive": 8.96, - "sigma_15_breth": 6.79, - "peak_r": 0.2657, - "peak_lag_days": -525, - "sigma_pk_naive": 15.34, + "n_eff": 3055.9, + "r_at_15d": 0.0368, + "sigma_15_naive": 2.08, + "sigma_15_breth": 2.03, + "peak_r": 0.0919, + "peak_lag_days": -125, + "sigma_pk_naive": 5.22, "p_global_iaaft": 0.0, "sigma_iaaft": 3.89 } diff --git a/results/figs/detrend_robustness.png b/results/figs/detrend_robustness.png new file mode 100644 index 0000000..b3fb0ca Binary files /dev/null and b/results/figs/detrend_robustness.png differ diff --git a/results/figs/detrended_xcorr.png b/results/figs/detrended_xcorr.png index 089bd2c..f93870a 100644 Binary files a/results/figs/detrended_xcorr.png and b/results/figs/detrended_xcorr.png differ diff --git a/results/figs/full_series_with_envelope_fit.png b/results/figs/full_series_with_envelope_fit.png index b1b9867..c1117d7 100644 Binary files a/results/figs/full_series_with_envelope_fit.png and b/results/figs/full_series_with_envelope_fit.png differ diff --git a/results/figs/homola_replication.png b/results/figs/homola_replication.png index c2c26d6..317e379 100644 Binary files a/results/figs/homola_replication.png and b/results/figs/homola_replication.png differ diff --git a/results/figs/magnitude_threshold.png b/results/figs/magnitude_threshold.png new file mode 100644 index 0000000..fc336db Binary files /dev/null and b/results/figs/magnitude_threshold.png differ diff --git a/results/figs/oos_xcorr.png b/results/figs/oos_xcorr.png index 17a5e2d..5d92e43 100644 Binary files a/results/figs/oos_xcorr.png and b/results/figs/oos_xcorr.png differ diff --git a/results/figs/rolling_correlation_oos.png b/results/figs/rolling_correlation_oos.png index 59f3044..93b32de 100644 Binary files a/results/figs/rolling_correlation_oos.png and b/results/figs/rolling_correlation_oos.png differ diff --git a/results/homola_replication.json b/results/homola_replication.json index b887395..14daf29 100644 --- a/results/homola_replication.json +++ b/results/homola_replication.json @@ -1,8 +1,8 @@ { "script": "scripts/02_homola_replication.py", - "git_sha": "unknown", + "git_sha": "817d7ba", "seed": 42, - "timestamp_utc": "2026-04-21T09:02:19.401598+00:00", + "timestamp_utc": "2026-04-24T12:02:57.512719+00:00", "study_window": [ "1976-01-01", "2019-12-31" @@ -22,13 +22,13 @@ "cr_data_coverage_pct": 100.03, "peak_lag_days": -525, "peak_lag_bins": -105, - "peak_r": 0.469101, - "naive_p_value": 5.631233e-170, - "naive_sigma": 27.791, + "peak_r": 0.138646, + "naive_p_value": 8.079288e-15, + "naive_sigma": 7.766, "n_pairs_at_peak": 3110, - "r_at_tau_15d": 0.309878, - "p_at_tau_15d": 1.93927e-72, - "sigma_at_tau_15d": 18.0, + "r_at_tau_15d": 0.08148, + "p_at_tau_15d": 3.768669e-06, + "sigma_at_tau_15d": 4.624, "n_lags_scanned": 401, "note_significance": "Naive p from Pearson t-test treating 5-day bins as i.i.d. Not corrected for: (1) temporal autocorrelation, (2) shared solar-cycle trend, (3) scan over multiple lags. Expected to be grossly over-significant \u2014 corrected tests in phase 2." } \ No newline at end of file diff --git a/results/magnitude_sensitivity.json b/results/magnitude_sensitivity.json new file mode 100644 index 0000000..1db2030 --- /dev/null +++ b/results/magnitude_sensitivity.json @@ -0,0 +1,20 @@ +{ + "M_ge_4.5": { + "r_at_tau_plus15d": 0.0786, + "peak_r": 0.1349, + "peak_lag_days": -525, + "frac_empty_bins_pct": 0.0 + }, + "M_ge_5.0": { + "r_at_tau_plus15d": 0.0722, + "peak_r": 0.1262, + "peak_lag_days": -525, + "frac_empty_bins_pct": 0.0 + }, + "M_ge_6.0": { + "r_at_tau_plus15d": 0.0495, + "peak_r": 0.1104, + "peak_lag_days": -525, + "frac_empty_bins_pct": 19.7 + } +} \ No newline at end of file diff --git a/results/neff_comparison.json b/results/neff_comparison.json new file mode 100644 index 0000000..ac794c4 --- /dev/null +++ b/results/neff_comparison.json @@ -0,0 +1,30 @@ +{ + "N_raw": 3214, + "r_at_tau_plus15d": 0.07862, + "methods": { + "Bartlett_1946_first_order": { + "Neff": 2923.1, + "CI_95_r": [ + 0.04248258115478359, + 0.11454542812932819 + ], + "note": "N\u00b7(1\u2212\u03c1\u2081\u2093\u03c1\u2081\u1d67)/(1+\u03c1\u2081\u2093\u03c1\u2081\u1d67) [current implementation]" + }, + "Bretherton_1999_full_sum": { + "Neff": 769.4, + "CI_95_r": [ + 0.007981897773749757, + 0.14847087395884573 + ], + "note": "N / (1 + 2\u03a3_k \u03c1_xx(k)\u03c1_yy(k)) K=200 lags" + }, + "Monte_Carlo_phase_surrogates": { + "Neff": 593.8, + "CI_95_r": [ + -0.0018607729923741843, + 0.15808238828278748 + ], + "note": "1/Var(r_null) from 1000 phase-randomised y series" + } + } +} \ No newline at end of file diff --git a/results/out_of_sample_metrics.json b/results/out_of_sample_metrics.json index 727a872..8669288 100644 --- a/results/out_of_sample_metrics.json +++ b/results/out_of_sample_metrics.json @@ -1,22 +1,22 @@ { - "git_sha": "unknown", - "avail_run_date": "unknown", + "git_sha": "817d7ba", + "avail_run_date": "2026-04-22", "study_start": "2020-01-01", "study_end": "2025-04-29", "T_valid": 390, - "n_stations": 35, + "n_stations": 30, "n_surrogates": 100000, "seed": 42, - "gpu_device": "Tesla M40 (12.0 GB)", - "r_at_15d": 0.044564, - "r_at_15d_hp": 0.026665, - "surr_p95_at_15d": 0.135623, - "p_global": 0.99404, - "p_global_linear_detrend": 1.0, - "sigma_surr": 0.007, - "peak_r": 0.110406, - "peak_lag_days": 135, - "n_eff": 274.5, + "gpu_device": "CuPy not installed", + "r_at_15d": 0.030385, + "r_at_15d_hp": 0.02318, + "surr_p95_at_15d": 0.101216, + "p_global": 0.1002, + "p_global_linear_detrend": 0.1364, + "sigma_surr": 1.644, + "peak_r": 0.135751, + "peak_lag_days": 125, + "n_eff": 373.2, "n_rolling_windows": 16, "n_significant_bh": 0, "expected_fp": 0.0, @@ -34,162 +34,162 @@ { "center_date": "2020-09-27", "i0": 0, - "r": -0.020673974266789276, - "r_95_lo": -0.2384207768025597, - "r_95_hi": 0.19905195096046985, - "surr_p95": 0.24703209108565127, + "r": -0.05146524626287487, + "r_95_lo": -0.2372733290885531, + "r_95_hi": 0.1379755743296592, + "surr_p95": 0.18455536527564864, "n_pairs": 106, - "n_eff": 80.7 + "n_eff": 109.0 }, { "center_date": "2020-12-26", "i0": 18, - "r": -0.1596215601323441, - "r_95_lo": -0.3652401175794433, - "r_95_hi": 0.06084746948303837, - "surr_p95": 0.2420104634210208, + "r": -0.08519551278879822, + "r_95_lo": -0.2708070529229959, + "r_95_hi": 0.10652377699228903, + "surr_p95": 0.18802328314778477, "n_pairs": 106, - "n_eff": 81.0 + "n_eff": 106.9 }, { "center_date": "2021-03-26", "i0": 36, - "r": -0.10746670672666697, - "r_95_lo": -0.3483903115845773, - "r_95_hi": 0.14677605655326095, - "surr_p95": 0.2572000321729004, + "r": -0.024616344795131185, + "r_95_lo": -0.21174098295275237, + "r_95_hi": 0.16424930129951676, + "surr_p95": 0.21866856372186502, "n_pairs": 106, - "n_eff": 61.7 + "n_eff": 109.0 }, { "center_date": "2021-06-24", "i0": 54, - "r": -0.10098858847235519, - "r_95_lo": -0.3463312883265511, - "r_95_hi": 0.1572843431029865, - "surr_p95": 0.26650154565559175, + "r": -0.016257852974678237, + "r_95_lo": -0.20402574462377127, + "r_95_hi": 0.17266378685923547, + "surr_p95": 0.2211427634545669, "n_pairs": 106, - "n_eff": 59.9 + "n_eff": 108.7 }, { "center_date": "2021-09-22", "i0": 72, - "r": 0.01254431298486686, - "r_95_lo": -0.25387482798887767, - "r_95_hi": 0.27719423163204004, - "surr_p95": 0.2985998754816451, + "r": 0.03640189161691471, + "r_95_lo": -0.15631782746213083, + "r_95_hi": 0.22645153832584944, + "surr_p95": 0.1729728182942318, "n_pairs": 106, - "n_eff": 54.9 + "n_eff": 105.0 }, { "center_date": "2021-12-21", "i0": 90, - "r": 0.09250204283123152, - "r_95_lo": -0.20439300590462595, - "r_95_hi": 0.3738122683731473, - "surr_p95": 0.27676919680091644, + "r": 0.006645023408882456, + "r_95_lo": -0.20677276273545037, + "r_95_hi": 0.2194591712425492, + "surr_p95": 0.20343367527082096, "n_pairs": 106, - "n_eff": 45.7 + "n_eff": 85.0 }, { "center_date": "2022-03-21", "i0": 108, - "r": 0.14320771523610307, - "r_95_lo": -0.13729493377312108, - "r_95_hi": 0.40244693119546243, - "surr_p95": 0.28437908737796685, + "r": 0.06352573283154293, + "r_95_lo": -0.1583562943219795, + "r_95_hi": 0.27930033312679997, + "surr_p95": 0.22702618727314533, "n_pairs": 106, - "n_eff": 51.2 + "n_eff": 80.0 }, { "center_date": "2022-06-19", "i0": 126, - "r": 0.07509703884831592, - "r_95_lo": -0.13160095985883036, - "r_95_hi": 0.2755371203194297, - "surr_p95": 0.20138619969258406, + "r": -0.027100164278328558, + "r_95_lo": -0.22007816519070642, + "r_95_hi": 0.167919134512406, + "surr_p95": 0.17532793155067355, "n_pairs": 106, - "n_eff": 92.1 + "n_eff": 102.4 }, { "center_date": "2022-09-17", "i0": 144, - "r": 0.2168176251787577, - "r_95_lo": 0.011086902245728723, - "r_95_hi": 0.404937880290396, - "surr_p95": 0.26527264789585464, + "r": -0.10699604981829633, + "r_95_lo": -0.3066425251270265, + "r_95_hi": 0.10166838691938893, + "surr_p95": 0.1844732872521301, "n_pairs": 106, - "n_eff": 90.8 + "n_eff": 90.6 }, { "center_date": "2022-12-16", "i0": 162, - "r": 0.16887990508570438, - "r_95_lo": -0.034272026977641694, - "r_95_hi": 0.3586296276290144, - "surr_p95": 0.19453090531555742, + "r": -0.07888848360107797, + "r_95_lo": -0.27716415336527417, + "r_95_hi": 0.1258316197427093, + "surr_p95": 0.2388264847348992, "n_pairs": 106, - "n_eff": 94.6 + "n_eff": 93.9 }, { "center_date": "2023-03-16", "i0": 180, - "r": -0.007919126521579136, - "r_95_lo": -0.24368819753335721, - "r_95_hi": 0.22873370185387817, - "surr_p95": 0.244394067315623, + "r": -0.0007704907686197004, + "r_95_lo": -0.2019803672154244, + "r_95_hi": 0.20050179234117366, + "surr_p95": 0.15998839966310602, "n_pairs": 106, - "n_eff": 69.3 + "n_eff": 95.3 }, { "center_date": "2023-06-14", "i0": 198, - "r": -0.03593975038951865, - "r_95_lo": -0.27513801901583773, - "r_95_hi": 0.20744861668280118, - "surr_p95": 0.26934427417321, + "r": 0.06929904703818325, + "r_95_lo": -0.12992580150347988, + "r_95_hi": 0.26314554166370735, + "surr_p95": 0.20534739264470322, "n_pairs": 106, - "n_eff": 66.2 + "n_eff": 99.0 }, { "center_date": "2023-09-12", "i0": 216, - "r": 0.012229179706855697, - "r_95_lo": -0.2289007755823031, - "r_95_hi": 0.2519451642078741, - "surr_p95": 0.21335780781894043, + "r": 0.18403090660697918, + "r_95_lo": -0.008959519602586617, + "r_95_hi": 0.3638039817392954, + "surr_p95": 0.17975161390753824, "n_pairs": 106, - "n_eff": 66.9 + "n_eff": 103.9 }, { "center_date": "2023-12-11", "i0": 234, - "r": 0.06543567665343912, - "r_95_lo": -0.1901695876704142, - "r_95_hi": 0.3127329265694898, - "surr_p95": 0.2519767306793838, + "r": 0.19843646031605602, + "r_95_lo": 0.003184073430459692, + "r_95_hi": 0.37911415499755813, + "surr_p95": 0.2041415037100962, "n_pairs": 106, - "n_eff": 60.7 + "n_eff": 101.1 }, { "center_date": "2024-03-10", "i0": 252, - "r": 0.07906053491790431, - "r_95_lo": -0.1373503142101981, - "r_95_hi": 0.2882674790685488, - "surr_p95": 0.19634784346737047, + "r": 0.16708140560904264, + "r_95_lo": -0.021705881888515865, + "r_95_hi": 0.3443635468561599, + "surr_p95": 0.19730403010824565, "n_pairs": 106, - "n_eff": 84.2 + "n_eff": 109.0 }, { "center_date": "2024-06-08", "i0": 270, - "r": 0.029537138940926456, - "r_95_lo": -0.19206747741251445, - "r_95_hi": 0.24827571906628718, - "surr_p95": 0.1950217577409729, + "r": 0.05455744825916992, + "r_95_lo": -0.13532003457768557, + "r_95_hi": 0.24056954124476196, + "surr_p95": 0.16195572741315192, "n_pairs": 106, - "n_eff": 79.5 + "n_eff": 108.6 } ] } \ No newline at end of file diff --git a/scripts/02_homola_replication.py b/scripts/02_homola_replication.py index aa678f1..a98f84f 100644 --- a/scripts/02_homola_replication.py +++ b/scripts/02_homola_replication.py @@ -52,7 +52,7 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) from crq.ingest.nmdb import load_station, resample_daily -from crq.ingest.usgs import load_usgs +from crq.ingest.usgs import load_usgs, seismic_energy_per_bin logging.basicConfig( level=logging.INFO, @@ -268,21 +268,23 @@ def build_seismic_metric( len(events), min_mag, study_start, study_end, ) - # Daily sum-of-Mw then bin to 5-day totals (sum, not mean) - daily_mw = events["mag"].resample("1D").sum() - daily_mw = daily_mw.reindex( - pd.date_range(study_start, study_end, freq="D"), fill_value=0.0 + # Physically correct: E = 10^(1.5·Mw + 4.8) J per event, summed per bin, + # returned as log10(E_sum). See Kanamori (1977). + t0 = pd.Timestamp(study_start) + seismic = seismic_energy_per_bin( + events, study_start, study_end, bin_days, t0, min_mag=min_mag, ) - seismic = _bin_series(daily_mw, study_start, bin_days, agg="sum").fillna(0.0) - seismic.name = "seismic_mw_sum" + seismic = seismic.fillna(seismic.min()) # fill rare empty bins with floor # Align to CR index bins if cr_index is not None: - seismic = seismic.reindex(cr_index.index, fill_value=0.0) + seismic = seismic.reindex(cr_index.index, fill_value=seismic.min()) - nonzero_pct = 100 * (seismic > 0).mean() + nonnan_pct = 100 * seismic.notna().mean() logger.info( - "Seismic metric: %d bins, %.1f%% non-zero", len(seismic), nonzero_pct + "Seismic metric (log10 E): %d bins, %.1f%% non-NaN, " + "range [%.1f, %.1f] log10(J)", + len(seismic), nonnan_pct, float(seismic.min()), float(seismic.max()), ) return seismic @@ -471,8 +473,12 @@ def make_figure( # ── Panel 3: seismic metric ──────────────────────────────────────────── ax3 = axes[2] ax3.fill_between(seismic_5d.index, seismic_5d.values, color="firebrick", alpha=0.6, lw=0) - ax3.set_ylabel(f"Σ Mw (M≥{MIN_MAG:.1f})") - ax3.set_title(f"Global seismic metric (sum of all Mw in each 5-day bin)", fontsize=9) + ax3.set_ylabel(f"log₁₀(Σ E) [log₁₀ J] (M≥{MIN_MAG:.1f})") + ax3.set_title( + f"Global seismic metric (log₁₀ of summed energy per 5-day bin, " + f"E=10^(1.5·Mw+4.8) J)", + fontsize=9, + ) ax3.grid(True, alpha=0.18) ax3.set_xlim(seismic_5d.index[0], seismic_5d.index[-1]) diff --git a/scripts/07_out_of_sample.py b/scripts/07_out_of_sample.py index 0bafd33..0c2b7db 100644 --- a/scripts/07_out_of_sample.py +++ b/scripts/07_out_of_sample.py @@ -60,7 +60,7 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) from crq.ingest.nmdb import load_station, resample_daily -from crq.ingest.usgs import load_usgs +from crq.ingest.usgs import load_usgs, seismic_energy_per_bin from crq.stats.surrogates import ( surrogate_xcorr_test, n_eff_bretherton, @@ -336,19 +336,18 @@ def load_cr_and_seismic( cr_bin = _bin_series(global_daily, study_start, bin_days).reindex(ref_index) - # --- Seismic metric --- + # --- Seismic metric (log10 summed seismic energy, E = 10^(1.5·Mw+4.8) J) --- events = load_usgs(start_year, end_year, usgs_dir) - full_day_index = pd.date_range(study_start, study_end, freq="D") if events.empty or not isinstance(events.index, pd.DatetimeIndex): - logger.warning("No USGS events loaded for %s–%s; seismic series will be zeros", study_start, study_end) - daily_mw = pd.Series(0.0, index=full_day_index) + logger.warning("No USGS events for %s–%s; seismic metric will be NaN", study_start, study_end) + seismic_bin = pd.Series(np.nan, index=ref_index) else: - events = events.loc[study_start:study_end] - events = events[events["mag"] >= min_mag] - daily_mw = events["mag"].resample("1D").sum() - daily_mw = daily_mw.reindex(full_day_index, fill_value=0.0) - seismic_bin = _bin_series(daily_mw, study_start, bin_days, agg="sum").fillna(0.0) - seismic_bin = seismic_bin.reindex(ref_index, fill_value=0.0) + t0_bin = pd.Timestamp(study_start) + seismic_bin = seismic_energy_per_bin( + events, study_start, study_end, bin_days, t0_bin, min_mag=min_mag, + ) + floor_val = float(seismic_bin.min()) + seismic_bin = seismic_bin.reindex(ref_index, fill_value=floor_val) # Align on common non-NaN index common = cr_bin.index.intersection(seismic_bin.index) diff --git a/scripts/08_combined_timeseries.py b/scripts/08_combined_timeseries.py index 5cb9189..be91506 100644 --- a/scripts/08_combined_timeseries.py +++ b/scripts/08_combined_timeseries.py @@ -50,7 +50,7 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) from crq.ingest.nmdb import load_station, resample_daily -from crq.ingest.usgs import load_usgs +from crq.ingest.usgs import load_usgs, seismic_energy_per_bin from crq.stats.surrogates_gpu import ( surrogate_xcorr_test_gpu, gpu_available, @@ -141,16 +141,13 @@ def load_full_window( cr_bin = _bin_series(global_daily, t0, bin_days).reindex(ref_index) - # Seismic + # Seismic: log10 of summed seismic energy (E = 10^(1.5·Mw+4.8) J) events = load_usgs(start_year, end_year, usgs_dir) - events = events.loc[study_start:study_end] - events = events[events["mag"] >= min_mag] - daily_mw = events["mag"].resample("1D").sum() - daily_mw = daily_mw.reindex( - pd.date_range(study_start, study_end, freq="D"), fill_value=0.0 + seismic_bin = seismic_energy_per_bin( + events, study_start, study_end, bin_days, t0, min_mag=min_mag, ) - seismic_bin = _bin_series(daily_mw, t0, bin_days, agg="sum").fillna(0.0) - seismic_bin = seismic_bin.reindex(ref_index, fill_value=0.0) + floor_val = float(seismic_bin.min()) + seismic_bin = seismic_bin.reindex(ref_index, fill_value=floor_val) common = cr_bin.index.intersection(seismic_bin.index) cr_bin = cr_bin.reindex(common) @@ -361,16 +358,13 @@ def roster_analysis( t0 = pd.Timestamp(oos_start) ref_idx = _ref_index(oos_start, oos_end, bin_days) - # Seismic series (same for all rosters) + # Seismic series (log10 summed energy, same for all rosters) events = load_usgs(start_year, end_year, usgs_dir) - events = events.loc[oos_start:oos_end] - events = events[events["mag"] >= 4.0] - daily_mw = events["mag"].resample("1D").sum() - daily_mw = daily_mw.reindex( - pd.date_range(oos_start, oos_end, freq="D"), fill_value=0.0 + sei_bin = seismic_energy_per_bin( + events, oos_start, oos_end, bin_days, t0, min_mag=4.0, ) - sei_bin = _bin_series(daily_mw, t0, bin_days, agg="sum").fillna(0.0) - sei_bin = sei_bin.reindex(ref_idx, fill_value=0.0) + floor_val = float(sei_bin.min()) + sei_bin = sei_bin.reindex(ref_idx, fill_value=floor_val) for label, roster in [("A", rosters["A"]), ("B_oos", rosters["B_oos"]), diff --git a/scripts/10_robustness.py b/scripts/10_robustness.py new file mode 100644 index 0000000..e5bab3a --- /dev/null +++ b/scripts/10_robustness.py @@ -0,0 +1,555 @@ +#!/usr/bin/env python3 +""" +scripts/10_robustness.py +~~~~~~~~~~~~~~~~~~~~~~~~~ +Robustness checks for the CR–seismic correlation analysis. + +2b HP filter λ derivation and detrend comparison + • Prints the exact λ_p = 1600·(365/p)^4 formula + • 4-panel cross-correlation figure: raw / HP / Butterworth highpass + (2-yr cutoff) / 12-month rolling-mean subtraction + +2c Effective-N comparison + • Bretherton 1999 (full sum over lags) + • Bartlett 1946 first-order (current implementation) + • Monte Carlo via 1 000-replicate phase randomisation + • Supplementary table written to results/neff_comparison.json + +2d Magnitude threshold sensitivity + • Cross-correlation r(τ) for M≥4.5, M≥5.0, M≥6.0 + • 3-panel figure; flags if peak lags or signs diverge across thresholds + +All figures saved to results/figs/. Runs in ~2 min on CPU (no GPU needed). +""" + +from __future__ import annotations + +import json +import logging +import sys +import warnings +from pathlib import Path + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scipy.signal +import scipy.stats +import yaml + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT / "src")) + +from crq.ingest.nmdb import load_station, resample_daily +from crq.ingest.usgs import load_usgs, seismic_energy_per_bin +from crq.stats.surrogates import n_eff_bretherton, phase_randomise + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", + datefmt="%H:%M:%S", +) +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +BIN_DAYS = 5 +STUDY_START = "1976-01-01" +STUDY_END = "2019-12-31" +LAG_RANGE = 1000 # ±1000 days +MIN_STATIONS = 3 +COV_THRESH = 0.60 +SEED = 42 +OUT_DIR = ROOT / "results" +FIG_DIR = ROOT / "results" / "figs" +NMDB_DIR = ROOT / "data" / "raw" / "nmdb" +USGS_DIR = ROOT / "data" / "raw" / "usgs" +CFG_FILE = ROOT / "config" / "stations.yaml" + + +# --------------------------------------------------------------------------- +# Shared data loading +# --------------------------------------------------------------------------- + +def _station_names() -> list[str]: + with open(CFG_FILE) as fh: + return list(yaml.safe_load(fh)["stations"].keys()) + + +def _load_cr(study_start: str, study_end: str) -> pd.Series: + """Global CR index: station-normalised mean over 5-day bins.""" + t0 = pd.Timestamp(study_start) + t1 = pd.Timestamp(study_end) + start_yr, end_yr = t0.year, t1.year + + norm_cols: dict[str, pd.Series] = {} + for stn in _station_names(): + hourly = load_station(stn, start_yr, end_yr, NMDB_DIR) + if hourly.empty: + continue + daily = resample_daily(hourly, stn, coverage_threshold=COV_THRESH)[stn] + daily = daily.loc[study_start:study_end] + n_valid = daily.notna().sum() + if n_valid < 30: + continue + mu = daily.mean() + if not np.isfinite(mu) or mu <= 0: + continue + norm_cols[stn] = daily / mu + + if not norm_cols: + raise RuntimeError("No NMDB stations loaded.") + + mat = pd.DataFrame(norm_cols) + n_valid = mat.notna().sum(axis=1) + global_daily = mat.mean(axis=1) + global_daily[n_valid < MIN_STATIONS] = np.nan + + # 5-day bins + days = (global_daily.index - t0).days + bin_num = days // BIN_DAYS + bin_dates = t0 + pd.to_timedelta(bin_num * BIN_DAYS, unit="D") + cr = global_daily.groupby(bin_dates).mean() + cr.name = "cr_index" + log.info("CR loaded: %d bins, %d stations", len(cr), len(norm_cols)) + return cr + + +def _load_seismic( + study_start: str, + study_end: str, + min_mag: float = 4.5, + ref_index: "pd.DatetimeIndex | None" = None, +) -> pd.Series: + """log10(summed seismic energy) per 5-day bin.""" + t0 = pd.Timestamp(study_start) + events = load_usgs(t0.year, pd.Timestamp(study_end).year, USGS_DIR) + sei = seismic_energy_per_bin( + events, study_start, study_end, BIN_DAYS, t0, min_mag=min_mag, + ) + if ref_index is not None: + floor = float(sei.min()) + sei = sei.reindex(ref_index, fill_value=floor) + log.info("Seismic loaded (M≥%.1f): %d bins, range [%.1f, %.1f]", + min_mag, len(sei), float(sei.min()), float(sei.max())) + return sei + + +# --------------------------------------------------------------------------- +# Cross-correlation utility +# --------------------------------------------------------------------------- + +def xcorr(x: np.ndarray, y: np.ndarray, lag_bins: np.ndarray) -> np.ndarray: + """Pearson r(τ) for each lag in *lag_bins* (bin units, τ>0 = x leads y).""" + N = len(x) + rs = np.full(len(lag_bins), np.nan) + for i, lag in enumerate(lag_bins): + if lag >= 0: + xa, ya = x[:N - lag], y[lag:] + else: + absl = -lag + xa, ya = x[absl:], y[:N - absl] + ok = np.isfinite(xa) & np.isfinite(ya) + n = ok.sum() + if n >= 10: + r, _ = scipy.stats.pearsonr(xa[ok], ya[ok]) + rs[i] = r + return rs + + +# --------------------------------------------------------------------------- +# Detrending helpers +# --------------------------------------------------------------------------- + +def _hp_detrend(arr: np.ndarray, lam: float) -> np.ndarray: + """Return HP-filter residual.""" + from crq.preprocess.detrend import hp_filter_detrend + return hp_filter_detrend(arr, lamb=lam) + + +def _butterworth_highpass(arr: np.ndarray, cutoff_years: float = 2.0) -> np.ndarray: + """ + 3rd-order Butterworth highpass with cutoff at *cutoff_years* years. + Removes variability with periods longer than cutoff; keeps shorter-period signal. + """ + nyq = 1.0 / (2 * BIN_DAYS) # Nyquist in cycles/day + cutoff_cy_per_day = 1.0 / (cutoff_years * 365.25) + Wn = cutoff_cy_per_day / nyq # normalised frequency + Wn = float(np.clip(Wn, 1e-6, 0.999)) + sos = scipy.signal.butter(3, Wn, btype="high", output="sos") + clean = np.where(np.isfinite(arr), arr, np.nanmean(arr)) + filtered = scipy.signal.sosfiltfilt(sos, clean) + # Restore NaN positions + filtered[~np.isfinite(arr)] = np.nan + return filtered + + +def _rolling_mean_subtract(arr: np.ndarray, window_bins: int = 73) -> np.ndarray: + """Subtract a centred rolling mean (73 bins ≈ 365 days at 5-day bins).""" + s = pd.Series(arr) + trend = s.rolling(window=window_bins, center=True, min_periods=window_bins // 2).mean() + return (s - trend).to_numpy(dtype=float) + + +# --------------------------------------------------------------------------- +# 2b: HP λ derivation + detrend robustness +# --------------------------------------------------------------------------- + +def _hp_lambda_derivation() -> float: + """Print derivation and return λ_5.""" + lam_annual = 1600.0 + p = BIN_DAYS + lam_p = lam_annual * (365.0 / p) ** 4 + log.info("=== HP λ derivation ===") + log.info(" λ_annual = 1600 (Ravn & Uhlig 2002 standard for annual data)") + log.info(" For bin period p = %d days:", p) + log.info(" λ_p = 1600 × (365/p)^4 = 1600 × (%.1f)^4 = %.4e", 365.0 / p, lam_p) + log.info(" → λ_5 ≈ %.2e (used throughout this analysis)", lam_p) + return lam_p + + +def run_2b(cr: pd.Series, sei: pd.Series) -> None: + lam = _hp_lambda_derivation() + + lag_bins = np.arange(-LAG_RANGE // BIN_DAYS, LAG_RANGE // BIN_DAYS + 1) + lags_days = lag_bins * BIN_DAYS + + idx = cr.index.intersection(sei.index) + cr_a = cr.reindex(idx).to_numpy(float) + sei_a = sei.reindex(idx).to_numpy(float) + + # Four detrending variants + variants = [ + ("Raw (no detrending)", cr_a, sei_a), + ("HP filter λ=1.29×10⁵", _hp_detrend(cr_a, lam), _hp_detrend(sei_a, lam)), + ("Butterworth highpass (2-yr)", _butterworth_highpass(cr_a), _butterworth_highpass(sei_a)), + ("12-month rolling mean removal", _rolling_mean_subtract(cr_a), _rolling_mean_subtract(sei_a)), + ] + + colors = ["#555555", "#1b7837", "#2166ac", "#d6604d"] + fig, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True, sharey=False) + fig.suptitle( + "Detrending robustness: CR–seismic cross-correlation r(τ)\n" + "In-sample window 1976–2019, corrected seismic metric log₁₀(Σ E [J])", + fontsize=11, fontweight="bold", + ) + + for ax, (label, cr_v, sei_v), color in zip(axes.flat, variants, colors): + r_vals = xcorr(cr_v, sei_v, lag_bins) + + ax.axhline(0, color="k", lw=0.6) + ax.axvline(15, color="crimson", ls="--", lw=0.9, alpha=0.7, label="τ=+15 d") + ax.axvline(0, color="k", lw=0.5, alpha=0.3) + ax.plot(lags_days, r_vals, color=color, lw=1.3) + + # Annotate peak and τ=+15d + valid = np.isfinite(r_vals) + if valid.any(): + pk_i = np.nanargmax(np.abs(r_vals)) + pk_r = r_vals[pk_i] + pk_lg = lags_days[pk_i] + r15 = r_vals[np.searchsorted(lag_bins, 3)] # lag+15d = bin 3 + ax.scatter([pk_lg], [pk_r], color=color, s=40, zorder=5) + ax.text(0.02, 0.97, + f"peak r={pk_r:+.3f} @ τ={pk_lg:+d}d\nr(+15d)={r15:+.3f}", + transform=ax.transAxes, fontsize=8, va="top", + bbox=dict(boxstyle="round,pad=0.25", fc="white", alpha=0.85)) + + ax.set_title(label, fontsize=9, fontweight="bold") + ax.grid(True, alpha=0.25) + ax.set_xlim(lags_days[0], lags_days[-1]) + + for ax in axes[1]: + ax.set_xlabel("Lag τ (days) [τ>0 = CR leads seismic]") + for ax in axes[:, 0]: + ax.set_ylabel("Pearson r") + + fig.tight_layout() + path = FIG_DIR / "detrend_robustness.png" + fig.savefig(path, dpi=150, bbox_inches="tight") + plt.close(fig) + log.info("2b figure saved: %s", path) + + +# --------------------------------------------------------------------------- +# 2c: Neff comparison +# --------------------------------------------------------------------------- + +def _n_eff_bretherton_full(x: np.ndarray, y: np.ndarray, K: int = 200) -> float: + """ + Full Bretherton 1999 Neff summing over lags 1…K: + Neff = N / (1 + 2 Σ_{k=1}^{K} r_xx(k) r_yy(k)) + """ + x = x[np.isfinite(x)]; y = y[np.isfinite(y)] + n = min(len(x), len(y)) + if n < 10: + return float(n) + x = x[:n]; y = y[:n] + xc = x - x.mean(); yc = y - y.mean() + var_x = np.dot(xc, xc); var_y = np.dot(yc, yc) + if var_x < 1e-15 or var_y < 1e-15: + return float(n) + K_use = min(K, n - 2) + s = 0.0 + for k in range(1, K_use + 1): + rxx = np.dot(xc[:-k], xc[k:]) / var_x + ryy = np.dot(yc[:-k], yc[k:]) / var_y + s += rxx * ryy + denom = 1.0 + 2.0 * s + return float(np.clip(n / denom, 3.0, n)) + + +def _n_eff_bartlett(x: np.ndarray, y: np.ndarray) -> float: + """ + Bartlett 1946 first-order estimate: + Neff = N · (1 − ρ₁ₓ ρ₁ᵧ) / (1 + ρ₁ₓ ρ₁ᵧ) + """ + # This is what the current n_eff_bretherton() in surrogates.py computes. + def r1(v): + v = v[np.isfinite(v)] + if len(v) < 4: + return 0.0 + vc = v - v.mean() + var = np.dot(vc, vc) + if var < 1e-15: + return 0.0 + return float(np.clip(np.dot(vc[:-1], vc[1:]) / var, -0.9999, 0.9999)) + + n = min(len(x), len(y)) + r1x, r1y = r1(x), r1(y) + denom = 1.0 + r1x * r1y + if abs(denom) < 1e-12: + return 3.0 + return float(np.clip(n * (1.0 - r1x * r1y) / denom, 3.0, n)) + + +def _n_eff_monte_carlo( + x: np.ndarray, + y: np.ndarray, + lag_bin: int = 3, + n_reps: int = 1000, + seed: int = SEED, +) -> float: + """ + Monte Carlo Neff via phase randomisation of y. + SE_r = std of r(lag_bin) under the null ≈ 1/sqrt(Neff) + → Neff_mc = 1 / SE_r² + """ + rng = np.random.default_rng(seed) + N = len(x) + ok = np.isfinite(x) & np.isfinite(y) + x_c = x[ok]; y_c = y[ok] + n = len(x_c) + + rs = np.empty(n_reps) + for i in range(n_reps): + y_surr = phase_randomise(y_c, seed=rng) + # Align at given lag_bin (positive = x leads y) + if lag_bin >= 0: + xa = x_c[:n - lag_bin]; ya = y_surr[lag_bin:] + else: + xa = x_c[-lag_bin:]; ya = y_surr[:n + lag_bin] + if len(xa) < 4: + rs[i] = 0.0 + continue + rs[i], _ = scipy.stats.pearsonr(xa, ya) + + se = float(np.std(rs, ddof=1)) + if se < 1e-12: + return float(n) + return float(np.clip(1.0 / se**2, 3.0, n)) + + +def run_2c(cr: pd.Series, sei: pd.Series) -> dict: + log.info("=== 2c: Neff comparison ===") + idx = cr.index.intersection(sei.index) + x = cr.reindex(idx).to_numpy(float) + y = sei.reindex(idx).to_numpy(float) + N = (~np.isnan(x) & ~np.isnan(y)).sum() + + r15, _ = scipy.stats.pearsonr( + x[np.isfinite(x) & np.isfinite(y)], + y[np.isfinite(x) & np.isfinite(y)], + ) + # Actually compute at lag +15d (bin 3) + ok = np.isfinite(x) & np.isfinite(y) + lag = 3 + xa, ya = x[:len(x) - lag], y[lag:] + ok2 = np.isfinite(xa) & np.isfinite(ya) + r15, _ = scipy.stats.pearsonr(xa[ok2], ya[ok2]) + + neff_bartlett = _n_eff_bartlett(x, y) + neff_breth_full = _n_eff_bretherton_full(x, y) + log.info("Running Monte Carlo Neff (1000 surrogates) …") + neff_mc = _n_eff_monte_carlo(x, y, lag_bin=3, n_reps=1000) + + def _ci(r, n): + if n < 4 or not np.isfinite(r): + return np.nan, np.nan + se = 1.0 / np.sqrt(n - 3) + z = np.arctanh(r) + return float(np.tanh(z - 1.96 * se)), float(np.tanh(z + 1.96 * se)) + + results = { + "N_raw": int(N), + "r_at_tau_plus15d": round(float(r15), 5), + "methods": { + "Bartlett_1946_first_order": { + "Neff": round(neff_bartlett, 1), + "CI_95_r": _ci(r15, neff_bartlett), + "note": "N·(1−ρ₁ₓρ₁ᵧ)/(1+ρ₁ₓρ₁ᵧ) [current implementation]", + }, + "Bretherton_1999_full_sum": { + "Neff": round(neff_breth_full, 1), + "CI_95_r": _ci(r15, neff_breth_full), + "note": "N / (1 + 2Σ_k ρ_xx(k)ρ_yy(k)) K=200 lags", + }, + "Monte_Carlo_phase_surrogates": { + "Neff": round(neff_mc, 1), + "CI_95_r": _ci(r15, neff_mc), + "note": "1/Var(r_null) from 1000 phase-randomised y series", + }, + }, + } + log.info(" N_raw=%d r(+15d)=%.4f", N, r15) + for k, v in results["methods"].items(): + ci = v["CI_95_r"] + log.info(" %-40s Neff=%6.0f 95%%CI=[%.3f, %.3f]", k, v["Neff"], ci[0], ci[1]) + + out = OUT_DIR / "neff_comparison.json" + with open(out, "w") as fh: + json.dump(results, fh, indent=2) + log.info("2c results saved: %s", out) + return results + + +# --------------------------------------------------------------------------- +# 2d: Magnitude threshold sensitivity +# --------------------------------------------------------------------------- + +def run_2d(cr: pd.Series) -> None: + log.info("=== 2d: Magnitude threshold sensitivity ===") + thresholds = [4.5, 5.0, 6.0] + lag_bins = np.arange(-LAG_RANGE // BIN_DAYS, LAG_RANGE // BIN_DAYS + 1) + lags_days = lag_bins * BIN_DAYS + + colors = ["#1b7837", "#2166ac", "#d6604d"] + fig, axes = plt.subplots(1, 3, figsize=(16, 4.5), sharey=True) + fig.suptitle( + "Magnitude threshold sensitivity: CR–seismic cross-correlation r(τ)\n" + "In-sample 1976–2019, no detrending, log₁₀(Σ E [J]) metric", + fontsize=10, fontweight="bold", + ) + + summary = {} + for ax, Mmin, color in zip(axes, thresholds, colors): + log.info(" Loading M≥%.1f seismic series …", Mmin) + sei = _load_seismic(STUDY_START, STUDY_END, min_mag=Mmin, ref_index=cr.index) + idx = cr.index.intersection(sei.index) + cr_a = cr.reindex(idx).to_numpy(float) + sei_a = sei.reindex(idx).to_numpy(float) + + n_events_approx = sei_a[np.isfinite(sei_a)].size + r_vals = xcorr(cr_a, sei_a, lag_bins) + + ax.axhline(0, color="k", lw=0.6) + ax.axvline(15, color="crimson", ls="--", lw=0.9, alpha=0.7) + ax.axvline(0, color="k", lw=0.5, alpha=0.3) + ax.plot(lags_days, r_vals, color=color, lw=1.3) + + valid = np.isfinite(r_vals) + r15 = float(r_vals[np.searchsorted(lag_bins, 3)]) # τ=+15d bin + pk_i = int(np.nanargmax(np.abs(r_vals))) + pk_r = float(r_vals[pk_i]) + pk_lg = int(lags_days[pk_i]) + + # Flag empty-bin fraction (relevant at M≥6.0) + n_nan = int(np.isnan(sei_a).sum()) + frac_empty = 100.0 * n_nan / len(sei_a) + + ax.scatter([pk_lg], [pk_r], color=color, s=40, zorder=5) + note = f"⚠ {frac_empty:.0f}% empty bins" if frac_empty > 5 else "" + ax.text(0.02, 0.97, + f"peak r={pk_r:+.3f} @ τ={pk_lg:+d}d\nr(+15d)={r15:+.3f}\n{note}", + transform=ax.transAxes, fontsize=8, va="top", + bbox=dict(boxstyle="round,pad=0.25", fc="white", alpha=0.85)) + + ax.set_title(f"M ≥ {Mmin:.1f}", fontsize=10, fontweight="bold") + ax.set_xlabel("Lag τ (days) [τ>0 = CR leads seismic]") + ax.grid(True, alpha=0.25) + ax.set_xlim(lags_days[0], lags_days[-1]) + + summary[f"M_ge_{Mmin}"] = { + "r_at_tau_plus15d": round(r15, 4), + "peak_r": round(pk_r, 4), + "peak_lag_days": pk_lg, + "frac_empty_bins_pct": round(frac_empty, 1), + } + log.info(" M≥%.1f: r(+15d)=%.4f peak r=%.4f @ %+d d empty=%.1f%%", + Mmin, r15, pk_r, pk_lg, frac_empty) + + axes[0].set_ylabel("Pearson r") + fig.tight_layout() + path = FIG_DIR / "magnitude_threshold.png" + fig.savefig(path, dpi=150, bbox_inches="tight") + plt.close(fig) + log.info("2d figure saved: %s", path) + + # Flag divergence + r15_vals = [summary[k]["r_at_tau_plus15d"] for k in summary] + if max(r15_vals) - min(r15_vals) > 0.05: + log.warning( + "r(+15d) shifts substantially across thresholds (range %.3f)." + " May indicate catalogue incompleteness or aftershock contamination " + "at lower magnitudes.", max(r15_vals) - min(r15_vals), + ) + + out = OUT_DIR / "magnitude_sensitivity.json" + with open(out, "w") as fh: + json.dump(summary, fh, indent=2) + log.info("2d results saved: %s", out) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + FIG_DIR.mkdir(parents=True, exist_ok=True) + OUT_DIR.mkdir(parents=True, exist_ok=True) + + log.info("Loading CR index …") + cr = _load_cr(STUDY_START, STUDY_END) + + log.info("Loading seismic metric (M≥4.5) …") + sei = _load_seismic(STUDY_START, STUDY_END, min_mag=4.5, ref_index=cr.index) + + log.info("Running 2b: HP λ derivation + detrend robustness …") + run_2b(cr, sei) + + log.info("Running 2c: Neff comparison …") + neff_results = run_2c(cr, sei) + + log.info("Running 2d: Magnitude threshold sensitivity …") + run_2d(cr) + + log.info("All robustness checks complete.") + + # Print Neff summary table + print("\n" + "=" * 70) + print(" Neff COMPARISON (in-sample 1976–2019, raw series, τ=+15 d)") + print("=" * 70) + print(f" N_raw = {neff_results['N_raw']} r(+15d) = {neff_results['r_at_tau_plus15d']:.4f}") + print() + for method, vals in neff_results["methods"].items(): + ci = vals["CI_95_r"] + ci_str = f"[{ci[0]:.3f}, {ci[1]:.3f}]" if all(np.isfinite(c) for c in ci) else "N/A" + print(f" {method:<42} Neff={vals['Neff']:>6.0f} 95%CI_r={ci_str}") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/src/crq/ingest/usgs.py b/src/crq/ingest/usgs.py index d6080b4..cd549ac 100644 --- a/src/crq/ingest/usgs.py +++ b/src/crq/ingest/usgs.py @@ -140,24 +140,85 @@ def load_usgs( # Aggregation # --------------------------------------------------------------------------- +def seismic_energy_per_bin( + events: pd.DataFrame, + study_start: str, + study_end: str, + bin_days: int, + t0: "pd.Timestamp | None" = None, + *, + min_mag: float = 4.5, +) -> pd.Series: + """ + Physically correct seismic energy metric per *bin_days*-day bin. + + Each earthquake contributes its radiated energy + E_i = 10^(1.5 · Mw_i + 4.8) [joules] + (Kanamori 1977). Values are summed over all events in the bin and the + result is returned as log10(E_bin_sum). Empty bins → NaN. + + Parameters + ---------- + events : DataFrame with DatetimeIndex and a "mag" column (from load_usgs). + study_start : ISO date string (inclusive). + study_end : ISO date string (inclusive). + bin_days : Bin width in days. + t0 : Anchor timestamp for floor-division binning; defaults to + pd.Timestamp(study_start). + min_mag : Minimum magnitude threshold. + + Returns + ------- + pd.Series with DatetimeIndex of bin start-dates and float64 values + (log10 of summed seismic energy in joules). + """ + if t0 is None: + t0 = pd.Timestamp(study_start) + + ev = events.loc[study_start:study_end] + ev = ev[ev["mag"] >= min_mag].copy() + + ev["energy"] = np.power(10.0, 1.5 * ev["mag"].values + 4.8) + + # Daily sum of energy + daily_e = ev["energy"].resample("1D").sum() + full_day_idx = pd.date_range(study_start, study_end, freq="D") + daily_e = daily_e.reindex(full_day_idx, fill_value=0.0) + + # Floor-division binning anchored at t0 + days_from_t0 = (daily_e.index - t0).days + bin_num = days_from_t0 // bin_days + bin_dates = t0 + pd.to_timedelta(bin_num * bin_days, unit="D") + bin_energy = daily_e.groupby(bin_dates).sum() + + # log10; empty bins → NaN + result = np.log10(bin_energy.where(bin_energy > 0, other=np.nan)) + result.name = "log10_seismic_energy_J" + return result + + def compute_daily_seismic( events: pd.DataFrame, interval: str = "1D", origin: str = "1960-01-01", ) -> pd.DataFrame: """ - Resample earthquake events to *interval* using the log-power average of - magnitude (same formula as the notebook's ``log_avg``). + .. deprecated:: + This function applied a dB-domain average to Mw values, which is + physically invalid (Mw is already logarithmic so 10·log10(mean(10^(Mw/10))) + does not represent energy). Use :func:`seismic_energy_per_bin` instead. - NaN days (no events) remain NaN — **not** zero. + Retained for backwards compatibility with existing callers only. """ mag = events["mag"].copy() - def _log_avg(s: pd.Series) -> float: + def _energy_log_mean(s: pd.Series) -> float: arr = s.dropna().values if arr.size == 0: return np.nan - return float(10.0 * np.log10(np.mean(np.power(10.0, arr / 10.0)))) + # Correct: sum seismic energy, return log10 + energies = np.power(10.0, 1.5 * arr + 4.8) + return float(np.log10(np.sum(energies))) - daily = mag.resample(interval).apply(_log_avg) + daily = mag.resample(interval).apply(_energy_log_mean) return daily.rename("mag").to_frame()