cosmicraysandearthquakes/scripts/10_robustness.py
root 798b0744b5 Fix seismic metric + robustness checks (issues 2a–2d)
2a — Replace physically invalid summed-Mw seismic metric with correct
     log10(ΣE) where E=10^(1.5·Mw+4.8) J (Kanamori 1977).  Updates
     usgs.py (new seismic_energy_per_bin()), scripts 02/07/08.
     Impact: r(+15d) raw 0.310→0.081, peak r 0.469→0.139; sinusoidal
     BF 27.5→0.75 (constant model now preferred).

2b — HP λ derivation: λ_5 = 1600×(365/5)^4 ≈ 4.54×10^10 (Ravn & Uhlig
     2002 rescaling); detrend robustness figure (HP/Butterworth/rolling
     mean), all show r(+15d)<0.04.

2c — Neff comparison: Bartlett 2923, Bretherton 769, Monte Carlo 594;
     MC 95% CI [-0.002, +0.158] straddles zero → raw r not significant
     under most conservative estimate.

2d — Magnitude threshold (M≥4.5/5.0/6.0): r(+15d) range 0.050–0.079
     (Δ=0.029 < 0.05), peak stable at τ=−525 d; no aftershock bias.

Paper (main.tex, refs.bib): update all numbers, add Kanamori/Bartlett/
Ravn–Uhlig refs, new sections for 2b–2d with figures and Neff table.
PDF recompiled (27 pages).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 14:49:54 +02:00

555 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
scripts/10_robustness.py
~~~~~~~~~~~~~~~~~~~~~~~~~
Robustness checks for the CRseismic correlation analysis.
2b HP filter λ derivation and detrend comparison
• Prints the exact λ_p = 1600·(365/p)^4 formula
• 4-panel cross-correlation figure: raw / HP / Butterworth highpass
(2-yr cutoff) / 12-month rolling-mean subtraction
2c Effective-N comparison
• Bretherton 1999 (full sum over lags)
• Bartlett 1946 first-order (current implementation)
• Monte Carlo via 1 000-replicate phase randomisation
• Supplementary table written to results/neff_comparison.json
2d Magnitude threshold sensitivity
• Cross-correlation r(τ) for M≥4.5, M≥5.0, M≥6.0
• 3-panel figure; flags if peak lags or signs diverge across thresholds
All figures saved to results/figs/. Runs in ~2 min on CPU (no GPU needed).
"""
from __future__ import annotations
import json
import logging
import sys
import warnings
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.signal
import scipy.stats
import yaml
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "src"))
from crq.ingest.nmdb import load_station, resample_daily
from crq.ingest.usgs import load_usgs, seismic_energy_per_bin
from crq.stats.surrogates import n_eff_bretherton, phase_randomise
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
BIN_DAYS = 5
STUDY_START = "1976-01-01"
STUDY_END = "2019-12-31"
LAG_RANGE = 1000 # ±1000 days
MIN_STATIONS = 3
COV_THRESH = 0.60
SEED = 42
OUT_DIR = ROOT / "results"
FIG_DIR = ROOT / "results" / "figs"
NMDB_DIR = ROOT / "data" / "raw" / "nmdb"
USGS_DIR = ROOT / "data" / "raw" / "usgs"
CFG_FILE = ROOT / "config" / "stations.yaml"
# ---------------------------------------------------------------------------
# Shared data loading
# ---------------------------------------------------------------------------
def _station_names() -> list[str]:
with open(CFG_FILE) as fh:
return list(yaml.safe_load(fh)["stations"].keys())
def _load_cr(study_start: str, study_end: str) -> pd.Series:
"""Global CR index: station-normalised mean over 5-day bins."""
t0 = pd.Timestamp(study_start)
t1 = pd.Timestamp(study_end)
start_yr, end_yr = t0.year, t1.year
norm_cols: dict[str, pd.Series] = {}
for stn in _station_names():
hourly = load_station(stn, start_yr, end_yr, NMDB_DIR)
if hourly.empty:
continue
daily = resample_daily(hourly, stn, coverage_threshold=COV_THRESH)[stn]
daily = daily.loc[study_start:study_end]
n_valid = daily.notna().sum()
if n_valid < 30:
continue
mu = daily.mean()
if not np.isfinite(mu) or mu <= 0:
continue
norm_cols[stn] = daily / mu
if not norm_cols:
raise RuntimeError("No NMDB stations loaded.")
mat = pd.DataFrame(norm_cols)
n_valid = mat.notna().sum(axis=1)
global_daily = mat.mean(axis=1)
global_daily[n_valid < MIN_STATIONS] = np.nan
# 5-day bins
days = (global_daily.index - t0).days
bin_num = days // BIN_DAYS
bin_dates = t0 + pd.to_timedelta(bin_num * BIN_DAYS, unit="D")
cr = global_daily.groupby(bin_dates).mean()
cr.name = "cr_index"
log.info("CR loaded: %d bins, %d stations", len(cr), len(norm_cols))
return cr
def _load_seismic(
study_start: str,
study_end: str,
min_mag: float = 4.5,
ref_index: "pd.DatetimeIndex | None" = None,
) -> pd.Series:
"""log10(summed seismic energy) per 5-day bin."""
t0 = pd.Timestamp(study_start)
events = load_usgs(t0.year, pd.Timestamp(study_end).year, USGS_DIR)
sei = seismic_energy_per_bin(
events, study_start, study_end, BIN_DAYS, t0, min_mag=min_mag,
)
if ref_index is not None:
floor = float(sei.min())
sei = sei.reindex(ref_index, fill_value=floor)
log.info("Seismic loaded (M≥%.1f): %d bins, range [%.1f, %.1f]",
min_mag, len(sei), float(sei.min()), float(sei.max()))
return sei
# ---------------------------------------------------------------------------
# Cross-correlation utility
# ---------------------------------------------------------------------------
def xcorr(x: np.ndarray, y: np.ndarray, lag_bins: np.ndarray) -> np.ndarray:
"""Pearson r(τ) for each lag in *lag_bins* (bin units, τ>0 = x leads y)."""
N = len(x)
rs = np.full(len(lag_bins), np.nan)
for i, lag in enumerate(lag_bins):
if lag >= 0:
xa, ya = x[:N - lag], y[lag:]
else:
absl = -lag
xa, ya = x[absl:], y[:N - absl]
ok = np.isfinite(xa) & np.isfinite(ya)
n = ok.sum()
if n >= 10:
r, _ = scipy.stats.pearsonr(xa[ok], ya[ok])
rs[i] = r
return rs
# ---------------------------------------------------------------------------
# Detrending helpers
# ---------------------------------------------------------------------------
def _hp_detrend(arr: np.ndarray, lam: float) -> np.ndarray:
"""Return HP-filter residual."""
from crq.preprocess.detrend import hp_filter_detrend
return hp_filter_detrend(arr, lamb=lam)
def _butterworth_highpass(arr: np.ndarray, cutoff_years: float = 2.0) -> np.ndarray:
"""
3rd-order Butterworth highpass with cutoff at *cutoff_years* years.
Removes variability with periods longer than cutoff; keeps shorter-period signal.
"""
nyq = 1.0 / (2 * BIN_DAYS) # Nyquist in cycles/day
cutoff_cy_per_day = 1.0 / (cutoff_years * 365.25)
Wn = cutoff_cy_per_day / nyq # normalised frequency
Wn = float(np.clip(Wn, 1e-6, 0.999))
sos = scipy.signal.butter(3, Wn, btype="high", output="sos")
clean = np.where(np.isfinite(arr), arr, np.nanmean(arr))
filtered = scipy.signal.sosfiltfilt(sos, clean)
# Restore NaN positions
filtered[~np.isfinite(arr)] = np.nan
return filtered
def _rolling_mean_subtract(arr: np.ndarray, window_bins: int = 73) -> np.ndarray:
"""Subtract a centred rolling mean (73 bins ≈ 365 days at 5-day bins)."""
s = pd.Series(arr)
trend = s.rolling(window=window_bins, center=True, min_periods=window_bins // 2).mean()
return (s - trend).to_numpy(dtype=float)
# ---------------------------------------------------------------------------
# 2b: HP λ derivation + detrend robustness
# ---------------------------------------------------------------------------
def _hp_lambda_derivation() -> float:
"""Print derivation and return λ_5."""
lam_annual = 1600.0
p = BIN_DAYS
lam_p = lam_annual * (365.0 / p) ** 4
log.info("=== HP λ derivation ===")
log.info(" λ_annual = 1600 (Ravn & Uhlig 2002 standard for annual data)")
log.info(" For bin period p = %d days:", p)
log.info(" λ_p = 1600 × (365/p)^4 = 1600 × (%.1f)^4 = %.4e", 365.0 / p, lam_p)
log.info(" → λ_5 ≈ %.2e (used throughout this analysis)", lam_p)
return lam_p
def run_2b(cr: pd.Series, sei: pd.Series) -> None:
lam = _hp_lambda_derivation()
lag_bins = np.arange(-LAG_RANGE // BIN_DAYS, LAG_RANGE // BIN_DAYS + 1)
lags_days = lag_bins * BIN_DAYS
idx = cr.index.intersection(sei.index)
cr_a = cr.reindex(idx).to_numpy(float)
sei_a = sei.reindex(idx).to_numpy(float)
# Four detrending variants
variants = [
("Raw (no detrending)", cr_a, sei_a),
("HP filter λ=1.29×10⁵", _hp_detrend(cr_a, lam), _hp_detrend(sei_a, lam)),
("Butterworth highpass (2-yr)", _butterworth_highpass(cr_a), _butterworth_highpass(sei_a)),
("12-month rolling mean removal", _rolling_mean_subtract(cr_a), _rolling_mean_subtract(sei_a)),
]
colors = ["#555555", "#1b7837", "#2166ac", "#d6604d"]
fig, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True, sharey=False)
fig.suptitle(
"Detrending robustness: CRseismic cross-correlation r(τ)\n"
"In-sample window 19762019, corrected seismic metric log₁₀(Σ E [J])",
fontsize=11, fontweight="bold",
)
for ax, (label, cr_v, sei_v), color in zip(axes.flat, variants, colors):
r_vals = xcorr(cr_v, sei_v, lag_bins)
ax.axhline(0, color="k", lw=0.6)
ax.axvline(15, color="crimson", ls="--", lw=0.9, alpha=0.7, label="τ=+15 d")
ax.axvline(0, color="k", lw=0.5, alpha=0.3)
ax.plot(lags_days, r_vals, color=color, lw=1.3)
# Annotate peak and τ=+15d
valid = np.isfinite(r_vals)
if valid.any():
pk_i = np.nanargmax(np.abs(r_vals))
pk_r = r_vals[pk_i]
pk_lg = lags_days[pk_i]
r15 = r_vals[np.searchsorted(lag_bins, 3)] # lag+15d = bin 3
ax.scatter([pk_lg], [pk_r], color=color, s=40, zorder=5)
ax.text(0.02, 0.97,
f"peak r={pk_r:+.3f} @ τ={pk_lg:+d}d\nr(+15d)={r15:+.3f}",
transform=ax.transAxes, fontsize=8, va="top",
bbox=dict(boxstyle="round,pad=0.25", fc="white", alpha=0.85))
ax.set_title(label, fontsize=9, fontweight="bold")
ax.grid(True, alpha=0.25)
ax.set_xlim(lags_days[0], lags_days[-1])
for ax in axes[1]:
ax.set_xlabel("Lag τ (days) [τ>0 = CR leads seismic]")
for ax in axes[:, 0]:
ax.set_ylabel("Pearson r")
fig.tight_layout()
path = FIG_DIR / "detrend_robustness.png"
fig.savefig(path, dpi=150, bbox_inches="tight")
plt.close(fig)
log.info("2b figure saved: %s", path)
# ---------------------------------------------------------------------------
# 2c: Neff comparison
# ---------------------------------------------------------------------------
def _n_eff_bretherton_full(x: np.ndarray, y: np.ndarray, K: int = 200) -> float:
"""
Full Bretherton 1999 Neff summing over lags 1…K:
Neff = N / (1 + 2 Σ_{k=1}^{K} r_xx(k) r_yy(k))
"""
x = x[np.isfinite(x)]; y = y[np.isfinite(y)]
n = min(len(x), len(y))
if n < 10:
return float(n)
x = x[:n]; y = y[:n]
xc = x - x.mean(); yc = y - y.mean()
var_x = np.dot(xc, xc); var_y = np.dot(yc, yc)
if var_x < 1e-15 or var_y < 1e-15:
return float(n)
K_use = min(K, n - 2)
s = 0.0
for k in range(1, K_use + 1):
rxx = np.dot(xc[:-k], xc[k:]) / var_x
ryy = np.dot(yc[:-k], yc[k:]) / var_y
s += rxx * ryy
denom = 1.0 + 2.0 * s
return float(np.clip(n / denom, 3.0, n))
def _n_eff_bartlett(x: np.ndarray, y: np.ndarray) -> float:
"""
Bartlett 1946 first-order estimate:
Neff = N · (1 ρ₁ₓ ρ₁ᵧ) / (1 + ρ₁ₓ ρ₁ᵧ)
"""
# This is what the current n_eff_bretherton() in surrogates.py computes.
def r1(v):
v = v[np.isfinite(v)]
if len(v) < 4:
return 0.0
vc = v - v.mean()
var = np.dot(vc, vc)
if var < 1e-15:
return 0.0
return float(np.clip(np.dot(vc[:-1], vc[1:]) / var, -0.9999, 0.9999))
n = min(len(x), len(y))
r1x, r1y = r1(x), r1(y)
denom = 1.0 + r1x * r1y
if abs(denom) < 1e-12:
return 3.0
return float(np.clip(n * (1.0 - r1x * r1y) / denom, 3.0, n))
def _n_eff_monte_carlo(
x: np.ndarray,
y: np.ndarray,
lag_bin: int = 3,
n_reps: int = 1000,
seed: int = SEED,
) -> float:
"""
Monte Carlo Neff via phase randomisation of y.
SE_r = std of r(lag_bin) under the null ≈ 1/sqrt(Neff)
→ Neff_mc = 1 / SE_r²
"""
rng = np.random.default_rng(seed)
N = len(x)
ok = np.isfinite(x) & np.isfinite(y)
x_c = x[ok]; y_c = y[ok]
n = len(x_c)
rs = np.empty(n_reps)
for i in range(n_reps):
y_surr = phase_randomise(y_c, seed=rng)
# Align at given lag_bin (positive = x leads y)
if lag_bin >= 0:
xa = x_c[:n - lag_bin]; ya = y_surr[lag_bin:]
else:
xa = x_c[-lag_bin:]; ya = y_surr[:n + lag_bin]
if len(xa) < 4:
rs[i] = 0.0
continue
rs[i], _ = scipy.stats.pearsonr(xa, ya)
se = float(np.std(rs, ddof=1))
if se < 1e-12:
return float(n)
return float(np.clip(1.0 / se**2, 3.0, n))
def run_2c(cr: pd.Series, sei: pd.Series) -> dict:
log.info("=== 2c: Neff comparison ===")
idx = cr.index.intersection(sei.index)
x = cr.reindex(idx).to_numpy(float)
y = sei.reindex(idx).to_numpy(float)
N = (~np.isnan(x) & ~np.isnan(y)).sum()
r15, _ = scipy.stats.pearsonr(
x[np.isfinite(x) & np.isfinite(y)],
y[np.isfinite(x) & np.isfinite(y)],
)
# Actually compute at lag +15d (bin 3)
ok = np.isfinite(x) & np.isfinite(y)
lag = 3
xa, ya = x[:len(x) - lag], y[lag:]
ok2 = np.isfinite(xa) & np.isfinite(ya)
r15, _ = scipy.stats.pearsonr(xa[ok2], ya[ok2])
neff_bartlett = _n_eff_bartlett(x, y)
neff_breth_full = _n_eff_bretherton_full(x, y)
log.info("Running Monte Carlo Neff (1000 surrogates) …")
neff_mc = _n_eff_monte_carlo(x, y, lag_bin=3, n_reps=1000)
def _ci(r, n):
if n < 4 or not np.isfinite(r):
return np.nan, np.nan
se = 1.0 / np.sqrt(n - 3)
z = np.arctanh(r)
return float(np.tanh(z - 1.96 * se)), float(np.tanh(z + 1.96 * se))
results = {
"N_raw": int(N),
"r_at_tau_plus15d": round(float(r15), 5),
"methods": {
"Bartlett_1946_first_order": {
"Neff": round(neff_bartlett, 1),
"CI_95_r": _ci(r15, neff_bartlett),
"note": "N·(1ρ₁ₓρ₁ᵧ)/(1+ρ₁ₓρ₁ᵧ) [current implementation]",
},
"Bretherton_1999_full_sum": {
"Neff": round(neff_breth_full, 1),
"CI_95_r": _ci(r15, neff_breth_full),
"note": "N / (1 + 2Σ_k ρ_xx(k)ρ_yy(k)) K=200 lags",
},
"Monte_Carlo_phase_surrogates": {
"Neff": round(neff_mc, 1),
"CI_95_r": _ci(r15, neff_mc),
"note": "1/Var(r_null) from 1000 phase-randomised y series",
},
},
}
log.info(" N_raw=%d r(+15d)=%.4f", N, r15)
for k, v in results["methods"].items():
ci = v["CI_95_r"]
log.info(" %-40s Neff=%6.0f 95%%CI=[%.3f, %.3f]", k, v["Neff"], ci[0], ci[1])
out = OUT_DIR / "neff_comparison.json"
with open(out, "w") as fh:
json.dump(results, fh, indent=2)
log.info("2c results saved: %s", out)
return results
# ---------------------------------------------------------------------------
# 2d: Magnitude threshold sensitivity
# ---------------------------------------------------------------------------
def run_2d(cr: pd.Series) -> None:
log.info("=== 2d: Magnitude threshold sensitivity ===")
thresholds = [4.5, 5.0, 6.0]
lag_bins = np.arange(-LAG_RANGE // BIN_DAYS, LAG_RANGE // BIN_DAYS + 1)
lags_days = lag_bins * BIN_DAYS
colors = ["#1b7837", "#2166ac", "#d6604d"]
fig, axes = plt.subplots(1, 3, figsize=(16, 4.5), sharey=True)
fig.suptitle(
"Magnitude threshold sensitivity: CRseismic cross-correlation r(τ)\n"
"In-sample 19762019, no detrending, log₁₀(Σ E [J]) metric",
fontsize=10, fontweight="bold",
)
summary = {}
for ax, Mmin, color in zip(axes, thresholds, colors):
log.info(" Loading M≥%.1f seismic series …", Mmin)
sei = _load_seismic(STUDY_START, STUDY_END, min_mag=Mmin, ref_index=cr.index)
idx = cr.index.intersection(sei.index)
cr_a = cr.reindex(idx).to_numpy(float)
sei_a = sei.reindex(idx).to_numpy(float)
n_events_approx = sei_a[np.isfinite(sei_a)].size
r_vals = xcorr(cr_a, sei_a, lag_bins)
ax.axhline(0, color="k", lw=0.6)
ax.axvline(15, color="crimson", ls="--", lw=0.9, alpha=0.7)
ax.axvline(0, color="k", lw=0.5, alpha=0.3)
ax.plot(lags_days, r_vals, color=color, lw=1.3)
valid = np.isfinite(r_vals)
r15 = float(r_vals[np.searchsorted(lag_bins, 3)]) # τ=+15d bin
pk_i = int(np.nanargmax(np.abs(r_vals)))
pk_r = float(r_vals[pk_i])
pk_lg = int(lags_days[pk_i])
# Flag empty-bin fraction (relevant at M≥6.0)
n_nan = int(np.isnan(sei_a).sum())
frac_empty = 100.0 * n_nan / len(sei_a)
ax.scatter([pk_lg], [pk_r], color=color, s=40, zorder=5)
note = f"{frac_empty:.0f}% empty bins" if frac_empty > 5 else ""
ax.text(0.02, 0.97,
f"peak r={pk_r:+.3f} @ τ={pk_lg:+d}d\nr(+15d)={r15:+.3f}\n{note}",
transform=ax.transAxes, fontsize=8, va="top",
bbox=dict(boxstyle="round,pad=0.25", fc="white", alpha=0.85))
ax.set_title(f"M ≥ {Mmin:.1f}", fontsize=10, fontweight="bold")
ax.set_xlabel("Lag τ (days) [τ>0 = CR leads seismic]")
ax.grid(True, alpha=0.25)
ax.set_xlim(lags_days[0], lags_days[-1])
summary[f"M_ge_{Mmin}"] = {
"r_at_tau_plus15d": round(r15, 4),
"peak_r": round(pk_r, 4),
"peak_lag_days": pk_lg,
"frac_empty_bins_pct": round(frac_empty, 1),
}
log.info(" M≥%.1f: r(+15d)=%.4f peak r=%.4f @ %+d d empty=%.1f%%",
Mmin, r15, pk_r, pk_lg, frac_empty)
axes[0].set_ylabel("Pearson r")
fig.tight_layout()
path = FIG_DIR / "magnitude_threshold.png"
fig.savefig(path, dpi=150, bbox_inches="tight")
plt.close(fig)
log.info("2d figure saved: %s", path)
# Flag divergence
r15_vals = [summary[k]["r_at_tau_plus15d"] for k in summary]
if max(r15_vals) - min(r15_vals) > 0.05:
log.warning(
"r(+15d) shifts substantially across thresholds (range %.3f)."
" May indicate catalogue incompleteness or aftershock contamination "
"at lower magnitudes.", max(r15_vals) - min(r15_vals),
)
out = OUT_DIR / "magnitude_sensitivity.json"
with open(out, "w") as fh:
json.dump(summary, fh, indent=2)
log.info("2d results saved: %s", out)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
FIG_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)
log.info("Loading CR index …")
cr = _load_cr(STUDY_START, STUDY_END)
log.info("Loading seismic metric (M≥4.5) …")
sei = _load_seismic(STUDY_START, STUDY_END, min_mag=4.5, ref_index=cr.index)
log.info("Running 2b: HP λ derivation + detrend robustness …")
run_2b(cr, sei)
log.info("Running 2c: Neff comparison …")
neff_results = run_2c(cr, sei)
log.info("Running 2d: Magnitude threshold sensitivity …")
run_2d(cr)
log.info("All robustness checks complete.")
# Print Neff summary table
print("\n" + "=" * 70)
print(" Neff COMPARISON (in-sample 19762019, raw series, τ=+15 d)")
print("=" * 70)
print(f" N_raw = {neff_results['N_raw']} r(+15d) = {neff_results['r_at_tau_plus15d']:.4f}")
print()
for method, vals in neff_results["methods"].items():
ci = vals["CI_95_r"]
ci_str = f"[{ci[0]:.3f}, {ci[1]:.3f}]" if all(np.isfinite(c) for c in ci) else "N/A"
print(f" {method:<42} Neff={vals['Neff']:>6.0f} 95%CI_r={ci_str}")
print("=" * 70)
if __name__ == "__main__":
main()