2a — Replace physically invalid summed-Mw seismic metric with correct
log10(ΣE) where E=10^(1.5·Mw+4.8) J (Kanamori 1977). Updates
usgs.py (new seismic_energy_per_bin()), scripts 02/07/08.
Impact: r(+15d) raw 0.310→0.081, peak r 0.469→0.139; sinusoidal
BF 27.5→0.75 (constant model now preferred).
2b — HP λ derivation: λ_5 = 1600×(365/5)^4 ≈ 4.54×10^10 (Ravn & Uhlig
2002 rescaling); detrend robustness figure (HP/Butterworth/rolling
mean), all show r(+15d)<0.04.
2c — Neff comparison: Bartlett 2923, Bretherton 769, Monte Carlo 594;
MC 95% CI [-0.002, +0.158] straddles zero → raw r not significant
under most conservative estimate.
2d — Magnitude threshold (M≥4.5/5.0/6.0): r(+15d) range 0.050–0.079
(Δ=0.029 < 0.05), peak stable at τ=−525 d; no aftershock bias.
Paper (main.tex, refs.bib): update all numbers, add Kanamori/Bartlett/
Ravn–Uhlig refs, new sections for 2b–2d with figures and Neff table.
PDF recompiled (27 pages).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
807 lines
31 KiB
Python
807 lines
31 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
scripts/08_combined_timeseries.py
|
||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||
Combined analysis on the FULL series (1976-to-end), comparing in-sample
|
||
(1976-2019) vs out-of-sample (2020-to-end) vs combined significance.
|
||
|
||
Key analyses
|
||
------------
|
||
1. Overall significance on the full window (phase surrogates, GPU)
|
||
2. Rolling r(τ=+15 d) in 3-year windows, 1-year steps — full 50-year span
|
||
3. Sinusoid fit: period ∈ [9, 13] years to the rolling r time series
|
||
4. Bayes factor: constant vs 11-year sinusoidal model (BIC approximation)
|
||
5. Does appending OOS data strengthen or weaken significance?
|
||
6. Station roster A/B/C comparison
|
||
|
||
Outputs
|
||
-------
|
||
results/figs/full_series_with_envelope_fit.png
|
||
results/combined_analysis.json
|
||
results/combined_analysis_report.md
|
||
|
||
Usage
|
||
-----
|
||
python scripts/08_combined_timeseries.py
|
||
python scripts/08_combined_timeseries.py --study-end 2026-01-01 --n-surrogates 10000
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import math
|
||
import sys
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
import matplotlib.gridspec as gridspec
|
||
import numpy as np
|
||
import pandas as pd
|
||
import scipy.optimize
|
||
import scipy.stats
|
||
import yaml
|
||
|
||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||
sys.path.insert(0, str(PROJECT_ROOT / "src"))
|
||
|
||
from crq.ingest.nmdb import load_station, resample_daily
|
||
from crq.ingest.usgs import load_usgs, seismic_energy_per_bin
|
||
from crq.stats.surrogates_gpu import (
|
||
surrogate_xcorr_test_gpu,
|
||
gpu_available,
|
||
_GPU_REASON,
|
||
)
|
||
from crq.stats.surrogates import n_eff_bretherton, p_to_sigma
|
||
from crq.ingest.station_roster import (
|
||
station_cr_series,
|
||
global_cr_index,
|
||
probe_station_coverage,
|
||
classify_stations,
|
||
)
|
||
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format="%(asctime)s %(levelname)-8s %(name)s — %(message)s",
|
||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||
)
|
||
logger = logging.getLogger("crq.combined")
|
||
|
||
BIN_DAYS = 5
|
||
LAG_15D_BINS = 15 // BIN_DAYS # 3 bins
|
||
INSAMPLE_START = "1976-01-01"
|
||
INSAMPLE_END = "2019-12-31"
|
||
OOS_START = "2020-01-01"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data loading
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _ref_index(study_start: str, study_end: str, bin_days: int) -> pd.DatetimeIndex:
|
||
t0 = pd.Timestamp(study_start)
|
||
t1 = pd.Timestamp(study_end)
|
||
n = (t1 - t0).days // bin_days + 1
|
||
return pd.DatetimeIndex([t0 + pd.Timedelta(days=i * bin_days) for i in range(n)])
|
||
|
||
|
||
def _bin_series(s: pd.Series, t0: pd.Timestamp, bin_days: int, agg: str = "mean") -> pd.Series:
|
||
days = (s.index - t0).days
|
||
bn = days // bin_days
|
||
bd = t0 + pd.to_timedelta(bn * bin_days, unit="D")
|
||
grp = s.groupby(bd)
|
||
return grp.sum() if agg == "sum" else grp.mean()
|
||
|
||
|
||
def load_full_window(
|
||
station_ids: list[str],
|
||
study_start: str,
|
||
study_end: str,
|
||
nmdb_dir: Path,
|
||
usgs_dir: Path,
|
||
bin_days: int = BIN_DAYS,
|
||
min_mag: float = 4.0,
|
||
min_stations: int = 3,
|
||
coverage_threshold: float = 0.60,
|
||
) -> tuple[pd.Series, pd.Series, pd.DatetimeIndex, int]:
|
||
t0 = pd.Timestamp(study_start)
|
||
t1 = pd.Timestamp(study_end)
|
||
start_year = t0.year
|
||
end_year = t1.year
|
||
ref_index = _ref_index(study_start, study_end, bin_days)
|
||
|
||
# Per-station CR
|
||
norm_cols: dict[str, pd.Series] = {}
|
||
for station in station_ids:
|
||
hourly = load_station(station, start_year, end_year, nmdb_dir)
|
||
if hourly.empty:
|
||
continue
|
||
daily_df = resample_daily(hourly, station, coverage_threshold=coverage_threshold)
|
||
daily = daily_df[station].loc[study_start:study_end]
|
||
n_valid = int(daily.notna().sum())
|
||
if n_valid < 30:
|
||
continue
|
||
mean_ = daily.mean()
|
||
if not (np.isfinite(mean_) and mean_ > 0):
|
||
continue
|
||
norm_cols[station] = (daily / mean_).dropna()
|
||
|
||
n_stations = len(norm_cols)
|
||
if n_stations == 0:
|
||
raise RuntimeError(f"No CR station data for {study_start}–{study_end}")
|
||
|
||
df_norm = pd.DataFrame(norm_cols)
|
||
n_valid_day = df_norm.notna().sum(axis=1)
|
||
global_daily = df_norm.mean(axis=1)
|
||
global_daily[n_valid_day < min(min_stations, n_stations)] = np.nan
|
||
|
||
cr_bin = _bin_series(global_daily, t0, bin_days).reindex(ref_index)
|
||
|
||
# Seismic: log10 of summed seismic energy (E = 10^(1.5·Mw+4.8) J)
|
||
events = load_usgs(start_year, end_year, usgs_dir)
|
||
seismic_bin = seismic_energy_per_bin(
|
||
events, study_start, study_end, bin_days, t0, min_mag=min_mag,
|
||
)
|
||
floor_val = float(seismic_bin.min())
|
||
seismic_bin = seismic_bin.reindex(ref_index, fill_value=floor_val)
|
||
|
||
common = cr_bin.index.intersection(seismic_bin.index)
|
||
cr_bin = cr_bin.reindex(common)
|
||
seismic_bin = seismic_bin.reindex(common)
|
||
|
||
logger.info(
|
||
"Window %s–%s: %d CR stations, %d bins, %d events",
|
||
study_start[:4], study_end[:4], n_stations, len(common), len(events),
|
||
)
|
||
return cr_bin, seismic_bin, ref_index, n_stations
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Rolling r(τ=15)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def compute_rolling_r15(
|
||
cr: pd.Series,
|
||
seismic: pd.Series,
|
||
window_bins: int,
|
||
step_bins: int,
|
||
lag_bins: int = LAG_15D_BINS,
|
||
) -> list[dict]:
|
||
"""
|
||
Rolling Pearson r at a fixed lag over the full series.
|
||
Returns list of {center_date, center_year, r, n_pairs, n_eff}.
|
||
"""
|
||
cr_arr = cr.to_numpy(dtype=np.float64)
|
||
sei_arr = seismic.to_numpy(dtype=np.float64)
|
||
dates = cr.index
|
||
T = len(cr_arr)
|
||
|
||
results = []
|
||
for i0 in range(0, T - window_bins + 1, step_bins):
|
||
i1 = i0 + window_bins
|
||
x = cr_arr[i0:i1]
|
||
y = sei_arr[i0:i1]
|
||
|
||
valid = np.isfinite(x) & np.isfinite(y)
|
||
if valid.sum() < 30:
|
||
continue
|
||
|
||
x_v = x[valid]; y_v = y[valid]
|
||
n = len(x_v)
|
||
L = lag_bins
|
||
|
||
if L >= n:
|
||
continue
|
||
|
||
n_pairs = n - L
|
||
xa = x_v[:n_pairs]; ya = y_v[L:L + n_pairs]
|
||
if n_pairs < 10:
|
||
continue
|
||
|
||
try:
|
||
r, _ = scipy.stats.pearsonr(xa, ya)
|
||
except Exception:
|
||
r = np.nan
|
||
|
||
n_eff = float(n_eff_bretherton(x_v, y_v))
|
||
center = dates[i0 + window_bins // 2]
|
||
results.append({
|
||
"center_date": str(center.date()),
|
||
"center_year": center.year + center.month / 12,
|
||
"r": float(r),
|
||
"n_pairs": n_pairs,
|
||
"n_eff": round(n_eff, 1),
|
||
})
|
||
|
||
return results
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Sinusoid fitting + Bayes factor (BIC)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def sinusoid_model(t: np.ndarray, A: float, P: float, phi: float, mu: float) -> np.ndarray:
|
||
return A * np.sin(2 * np.pi / P * t + phi) + mu
|
||
|
||
|
||
def fit_sinusoid(
|
||
years: np.ndarray,
|
||
r_vals: np.ndarray,
|
||
period_bounds: tuple[float, float] = (9.0, 13.0),
|
||
) -> dict:
|
||
"""
|
||
Fit constant and sinusoidal models to the rolling r time series.
|
||
|
||
Returns dict with model parameters, residuals, BIC, and Bayes factor.
|
||
"""
|
||
finite = np.isfinite(years) & np.isfinite(r_vals)
|
||
t = years[finite] - years[finite].mean() # centre time axis
|
||
r = r_vals[finite]
|
||
n = len(r)
|
||
if n < 8:
|
||
return {"status": "too_few_points", "n": n}
|
||
|
||
# Model A: constant (1 free parameter: mu)
|
||
mu_A = r.mean()
|
||
rss_A = float(np.sum((r - mu_A) ** 2))
|
||
k_A = 1
|
||
bic_A = n * math.log(rss_A / n) + k_A * math.log(n)
|
||
|
||
# Model B: A*sin(2π/P*t + φ) + mu (4 free parameters: A, P, φ, mu)
|
||
best_res = None
|
||
best_rss = np.inf
|
||
|
||
for P_init in np.linspace(period_bounds[0], period_bounds[1], 5):
|
||
for phi_init in np.linspace(0, 2 * math.pi, 4, endpoint=False):
|
||
try:
|
||
popt, _ = scipy.optimize.curve_fit(
|
||
sinusoid_model, t, r,
|
||
p0=[0.02, P_init, phi_init, mu_A],
|
||
bounds=(
|
||
[-0.3, period_bounds[0], -2*math.pi, -0.5],
|
||
[ 0.3, period_bounds[1], 4*math.pi, 0.5],
|
||
),
|
||
maxfev=10_000,
|
||
)
|
||
r_hat = sinusoid_model(t, *popt)
|
||
rss = float(np.sum((r - r_hat) ** 2))
|
||
if rss < best_rss:
|
||
best_rss = rss
|
||
best_res = popt
|
||
except Exception:
|
||
pass
|
||
|
||
if best_res is None:
|
||
return {"status": "fit_failed", "bic_A": bic_A, "rss_A": rss_A, "n": n}
|
||
|
||
A_fit, P_fit, phi_fit, mu_fit = best_res
|
||
k_B = 4
|
||
rss_B = best_rss
|
||
bic_B = n * math.log(max(rss_B, 1e-20) / n) + k_B * math.log(n)
|
||
d_bic = bic_A - bic_B # positive = B preferred
|
||
bf = math.exp(d_bic / 2) # Bayes factor B vs A
|
||
|
||
return {
|
||
"status": "ok",
|
||
"n": n,
|
||
"model_A_mu": float(mu_A),
|
||
"model_A_rss": float(rss_A),
|
||
"model_A_bic": float(bic_A),
|
||
"model_B_A": float(A_fit),
|
||
"model_B_period": float(P_fit),
|
||
"model_B_phi": float(phi_fit),
|
||
"model_B_mu": float(mu_fit),
|
||
"model_B_rss": float(rss_B),
|
||
"model_B_bic": float(bic_B),
|
||
"delta_bic": float(d_bic),
|
||
"bayes_factor": float(bf),
|
||
"preferred_model": "B (sinusoidal)" if d_bic > 0 else "A (constant)",
|
||
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Station roster analysis
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def roster_analysis(
|
||
station_ids: list[str],
|
||
nmdb_dir: Path,
|
||
insample_start: str,
|
||
insample_end: str,
|
||
oos_start: str,
|
||
oos_end: str,
|
||
usgs_dir: Path,
|
||
bin_days: int,
|
||
n_surr: int,
|
||
lag_bins: np.ndarray,
|
||
seed: int,
|
||
) -> dict:
|
||
"""
|
||
Compare surrogate-test p_global for station rosters A, B_oos, C.
|
||
|
||
Only runs if roster C (new stations) is non-empty.
|
||
Returns dict of roster_name -> p_global.
|
||
"""
|
||
windows = {
|
||
"in_sample": (insample_start, insample_end),
|
||
"out_of_sample": (oos_start, oos_end),
|
||
}
|
||
logger.info("Probing station coverage for roster classification …")
|
||
cov: dict[str, dict[str, float]] = {}
|
||
for sid in station_ids:
|
||
try:
|
||
cov[sid] = probe_station_coverage(sid, windows, nmdb_dir)
|
||
except Exception as exc:
|
||
logger.debug("station %s probe error: %s", sid, exc)
|
||
cov[sid] = {k: 0.0 for k in windows}
|
||
|
||
rosters = classify_stations(station_ids, cov,
|
||
in_sample_key="in_sample",
|
||
oos_key="out_of_sample")
|
||
logger.info(
|
||
"Rosters — A: %d B_oos: %d C (new): %d",
|
||
len(rosters["A"]), len(rosters["B_oos"]), len(rosters["C"]),
|
||
)
|
||
|
||
results = {
|
||
"roster_sizes": {k: len(v) for k, v in rosters.items()},
|
||
"rosters": rosters,
|
||
"p_global": {},
|
||
}
|
||
|
||
start_year = int(oos_start[:4])
|
||
end_year = int(oos_end[:4])
|
||
t0 = pd.Timestamp(oos_start)
|
||
ref_idx = _ref_index(oos_start, oos_end, bin_days)
|
||
|
||
# Seismic series (log10 summed energy, same for all rosters)
|
||
events = load_usgs(start_year, end_year, usgs_dir)
|
||
sei_bin = seismic_energy_per_bin(
|
||
events, oos_start, oos_end, bin_days, t0, min_mag=4.0,
|
||
)
|
||
floor_val = float(sei_bin.min())
|
||
sei_bin = sei_bin.reindex(ref_idx, fill_value=floor_val)
|
||
|
||
for label, roster in [("A", rosters["A"]),
|
||
("B_oos", rosters["B_oos"]),
|
||
("C", rosters["C"])]:
|
||
if not roster:
|
||
results["p_global"][label] = None
|
||
logger.info("Roster %s: empty — skip", label)
|
||
continue
|
||
|
||
norm_cols: dict[str, pd.Series] = {}
|
||
for sid in roster:
|
||
hourly = load_station(sid, start_year, end_year, nmdb_dir)
|
||
if hourly.empty:
|
||
continue
|
||
daily_df = resample_daily(hourly, sid)
|
||
daily = daily_df[sid].loc[oos_start:oos_end]
|
||
mean_ = daily.mean()
|
||
if not (np.isfinite(mean_) and mean_ > 0):
|
||
continue
|
||
norm_cols[sid] = (daily / mean_).dropna()
|
||
|
||
if not norm_cols:
|
||
results["p_global"][label] = None
|
||
continue
|
||
|
||
df_norm = pd.DataFrame(norm_cols)
|
||
n_valid_day = df_norm.notna().sum(axis=1)
|
||
global_daily = df_norm.mean(axis=1)
|
||
global_daily[n_valid_day < 1] = np.nan
|
||
cr_bin = _bin_series(global_daily, t0, bin_days).reindex(ref_idx)
|
||
|
||
common = cr_bin.notna() & sei_bin.notna()
|
||
x_arr = cr_bin[common].to_numpy(dtype=np.float32)
|
||
y_arr = sei_bin[common].to_numpy(dtype=np.float32)
|
||
|
||
if len(x_arr) < 50:
|
||
results["p_global"][label] = None
|
||
continue
|
||
|
||
sr = surrogate_xcorr_test_gpu(
|
||
x_arr, y_arr, lag_bins,
|
||
n_surrogates=n_surr, method="phase", seed=seed + hash(label) % 1000,
|
||
)
|
||
results["p_global"][label] = round(float(sr["p_global"]), 6)
|
||
logger.info("Roster %s (%d stations): p_global = %.4f",
|
||
label, len(roster), results["p_global"][label])
|
||
|
||
return results
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Figures
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def make_combined_figure(
|
||
rolling_full: list[dict],
|
||
sinusoid_fit: dict,
|
||
insample_end: str,
|
||
oos_start: str,
|
||
study_start: str,
|
||
study_end: str,
|
||
output_path: Path,
|
||
) -> None:
|
||
years = np.array([rw["center_year"] for rw in rolling_full])
|
||
rs = np.array([rw["r"] for rw in rolling_full])
|
||
|
||
fig = plt.figure(figsize=(16, 7))
|
||
gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0.35)
|
||
ax1 = fig.add_subplot(gs[0])
|
||
ax2 = fig.add_subplot(gs[1])
|
||
|
||
# ── Panel 1: rolling r(τ=+15 d) ─────────────────────────────────────
|
||
insample_end_year = float(insample_end[:4]) + 1.0
|
||
oos_start_year = float(oos_start[:4])
|
||
|
||
in_mask = years <= insample_end_year
|
||
oos_mask = years > oos_start_year - 0.5
|
||
|
||
ax1.scatter(years[in_mask], rs[in_mask], s=20, color="steelblue", zorder=4,
|
||
label="In-sample (1976–2019)")
|
||
ax1.scatter(years[oos_mask], rs[oos_mask], s=20, color="tomato", zorder=4,
|
||
label="Out-of-sample (2020–)")
|
||
ax1.plot(years, rs, color="k", linewidth=0.6, alpha=0.4)
|
||
|
||
# Sinusoid fit overlay
|
||
if sinusoid_fit.get("status") == "ok":
|
||
P = sinusoid_fit["model_B_period"]
|
||
A = sinusoid_fit["model_B_A"]
|
||
phi = sinusoid_fit["model_B_phi"]
|
||
mu = sinusoid_fit["model_B_mu"]
|
||
t_c = years.mean()
|
||
t_dense = np.linspace(years.min(), years.max(), 500)
|
||
fit_vals = sinusoid_model(t_dense - t_c, A, P, phi, mu)
|
||
bf = sinusoid_fit["bayes_factor"]
|
||
ax1.plot(t_dense, fit_vals, color="darkorange", linewidth=2.0,
|
||
label=f"Sinusoid fit: P={P:.1f} yr, A={A:.3f} (BF={bf:.2f})")
|
||
|
||
ax1.axhline(0, color="k", linewidth=0.5)
|
||
ax1.axvline(oos_start_year, color="gray", linewidth=1.2,
|
||
linestyle="--", alpha=0.7, label="In-sample / OOS boundary")
|
||
ax1.set_ylabel(f"Pearson r(τ = +15 d)")
|
||
ax1.set_title(
|
||
f"Full-series rolling r(τ=+15 d) in 3-year windows (1-year step) | "
|
||
f"{study_start[:4]}–{study_end[:4]}",
|
||
fontsize=10,
|
||
)
|
||
ax1.legend(fontsize=9, loc="upper left")
|
||
ax1.grid(True, alpha=0.3)
|
||
|
||
# ── Panel 2: rolling n_eff ───────────────────────────────────────────
|
||
n_effs = np.array([rw["n_eff"] for rw in rolling_full])
|
||
ax2.fill_between(years, 0, n_effs, alpha=0.4, color="steelblue")
|
||
ax2.axvline(oos_start_year, color="gray", linewidth=1.2, linestyle="--", alpha=0.7)
|
||
ax2.set_xlabel("Year")
|
||
ax2.set_ylabel("Effective N")
|
||
ax2.set_title("Bretherton effective sample size per rolling window", fontsize=9)
|
||
ax2.grid(True, alpha=0.3)
|
||
|
||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||
fig.savefig(output_path, dpi=150, bbox_inches="tight")
|
||
plt.close(fig)
|
||
logger.info("Combined figure saved: %s", output_path)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Report
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _fv(v, fmt: str) -> str:
|
||
"""Format a possibly-None float; returns 'N/A' when None/NaN."""
|
||
if v is None:
|
||
return "N/A"
|
||
try:
|
||
return format(float(v), fmt)
|
||
except (TypeError, ValueError):
|
||
return "N/A"
|
||
|
||
|
||
def write_combined_report(
|
||
results: dict,
|
||
sinusoid_fit: dict,
|
||
args: argparse.Namespace,
|
||
output_dir: Path,
|
||
) -> None:
|
||
ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||
bf = sinusoid_fit.get("bayes_factor", float("nan"))
|
||
P = sinusoid_fit.get("model_B_period", float("nan"))
|
||
|
||
# Interpret Bayes factor (Jeffreys scale)
|
||
if np.isnan(bf):
|
||
bf_interp = "Could not be computed"
|
||
elif bf < 1:
|
||
bf_interp = f"BF = {bf:.2f} < 1: evidence FAVOURS constant model (no envelope)"
|
||
elif bf < 3:
|
||
bf_interp = f"BF = {bf:.2f}: anecdotal evidence for sinusoidal envelope"
|
||
elif bf < 10:
|
||
bf_interp = f"BF = {bf:.2f}: moderate evidence for sinusoidal envelope"
|
||
elif bf < 30:
|
||
bf_interp = f"BF = {bf:.2f}: strong evidence for sinusoidal envelope"
|
||
else:
|
||
bf_interp = f"BF = {bf:.2f}: very strong evidence for sinusoidal envelope"
|
||
|
||
md = f"""# Combined Full-Series Analysis (1976–{args.study_end[:4]})
|
||
|
||
Generated: {ts}
|
||
Full window: {args.study_start} → {args.study_end}
|
||
In-sample: {INSAMPLE_START} → {INSAMPLE_END}
|
||
Out-of-sample: {OOS_START} → {args.study_end}
|
||
GPU: {_GPU_REASON}
|
||
Surrogates: {args.n_surrogates:,} per window
|
||
|
||
## Does appending OOS data strengthen or weaken significance?
|
||
|
||
| Window | p_global | σ_surrogate | peak lag |
|
||
|---|---|---|---|
|
||
| In-sample (1976–2019) | {_fv(results.get('p_global_insample'), '.4f')} | {_fv(results.get('sigma_insample'), '.2f')} | {results.get('peak_lag_insample', 'N/A')} d |
|
||
| Out-of-sample (2020–{args.study_end[:4]}) | {_fv(results.get('p_global_oos'), '.4f')} | {_fv(results.get('sigma_oos'), '.2f')} | {results.get('peak_lag_oos', 'N/A')} d |
|
||
| Combined (1976–{args.study_end[:4]}) | {_fv(results.get('p_global_full'), '.4f')} | {_fv(results.get('sigma_full'), '.2f')} | {results.get('peak_lag_full', 'N/A')} d |
|
||
|
||
## Sinusoidal envelope fit
|
||
|
||
{bf_interp}
|
||
|
||
Best-fit period: **{P:.2f} years** (constrained to [9, 13] years)
|
||
|
||
| Parameter | Value |
|
||
|---|---|
|
||
| Period P | {sinusoid_fit.get('model_B_period', float('nan')):.2f} yr |
|
||
| Amplitude A | {sinusoid_fit.get('model_B_A', float('nan')):.4f} |
|
||
| Phase φ | {sinusoid_fit.get('model_B_phi', float('nan')):.2f} rad |
|
||
| Baseline μ | {sinusoid_fit.get('model_B_mu', float('nan')):.4f} |
|
||
| Model B BIC | {sinusoid_fit.get('model_B_bic', float('nan')):.2f} |
|
||
| Model A BIC | {sinusoid_fit.get('model_A_bic', float('nan')):.2f} |
|
||
| ΔBIC (A−B) | {sinusoid_fit.get('delta_bic', float('nan')):.2f} |
|
||
| Bayes factor (BF) | {bf:.3f} |
|
||
|
||
## Station roster comparison (OOS window)
|
||
|
||
| Roster | Description | Stations | p_global |
|
||
|---|---|---|---|
|
||
| A | In BOTH windows | {results.get('roster', {}).get('roster_sizes', {}).get('A', '?')} | {results.get('roster', {}).get('p_global', {}).get('A', 'N/A')} |
|
||
| B_oos | All OOS stations | {results.get('roster', {}).get('roster_sizes', {}).get('B_oos', '?')} | {results.get('roster', {}).get('p_global', {}).get('B_oos', 'N/A')} |
|
||
| C | New OOS-only | {results.get('roster', {}).get('roster_sizes', {}).get('C', '?')} | {results.get('roster', {}).get('p_global', {}).get('C', 'N/A')} |
|
||
|
||
A real effect should appear consistently across all three rosters.
|
||
Divergence (e.g., significant only in A) would suggest station-selection bias.
|
||
|
||
## Figure
|
||
`results/figs/full_series_with_envelope_fit.png`
|
||
"""
|
||
path = output_dir / "combined_analysis_report.md"
|
||
path.write_text(md, encoding="utf-8")
|
||
logger.info("Combined report saved: %s", path)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _parse_args() -> argparse.Namespace:
|
||
p = argparse.ArgumentParser(
|
||
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
|
||
)
|
||
p.add_argument("--study-start", default=INSAMPLE_START)
|
||
p.add_argument("--study-end", default=None,
|
||
help="Override end date (default: read from data_availability.json)")
|
||
p.add_argument("--bin-days", type=int, default=BIN_DAYS)
|
||
p.add_argument("--lag-min", type=int, default=-200)
|
||
p.add_argument("--lag-max", type=int, default=+200)
|
||
p.add_argument("--min-mag", type=float, default=4.0)
|
||
p.add_argument("--n-surrogates", type=int, default=10_000,
|
||
help="Surrogates per window surrogate test (default 10 000)")
|
||
p.add_argument("--roll-window", type=int, default=None,
|
||
help="Rolling window bins (default: 3 years)")
|
||
p.add_argument("--roll-step", type=int, default=None,
|
||
help="Rolling step bins (default: 1 year)")
|
||
p.add_argument("--period-min", type=float, default=9.0)
|
||
p.add_argument("--period-max", type=float, default=13.0)
|
||
p.add_argument("--roster", action="store_true",
|
||
help="Run station roster A/B/C comparison (slow)")
|
||
p.add_argument("--seed", type=int, default=42)
|
||
p.add_argument("--nmdb-dir", type=Path, default=PROJECT_ROOT/"data"/"raw"/"nmdb")
|
||
p.add_argument("--usgs-dir", type=Path, default=PROJECT_ROOT/"data"/"raw"/"usgs")
|
||
p.add_argument("--config", type=Path, default=PROJECT_ROOT/"config"/"stations.yaml")
|
||
p.add_argument("--output-dir",type=Path, default=PROJECT_ROOT/"results")
|
||
return p.parse_args()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def run(args: argparse.Namespace) -> None:
|
||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||
(args.output_dir / "figs").mkdir(exist_ok=True)
|
||
|
||
# Determine study end
|
||
avail_path = args.output_dir / "data_availability.json"
|
||
if args.study_end is None:
|
||
if avail_path.exists():
|
||
avail = json.loads(avail_path.read_text())
|
||
args.study_end = avail["oos_end"]
|
||
logger.info("Study end from data_availability.json: %s", args.study_end)
|
||
else:
|
||
from datetime import date, timedelta
|
||
args.study_end = str(date.today() - timedelta(days=60))
|
||
logger.warning("data_availability.json not found; using %s", args.study_end)
|
||
|
||
with open(args.config) as fh:
|
||
cfg = yaml.safe_load(fh)
|
||
station_ids = list(cfg["stations"].keys())
|
||
|
||
lag_min_b = args.lag_min // args.bin_days
|
||
lag_max_b = args.lag_max // args.bin_days
|
||
lag_bins = np.arange(lag_min_b, lag_max_b + 1, dtype=int)
|
||
|
||
# Rolling window / step defaults
|
||
roll_window = args.roll_window or int(3 * 365.25 / args.bin_days) # 3 years
|
||
roll_step = args.roll_step or int(1 * 365.25 / args.bin_days) # 1 year
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 1. Load three windows #
|
||
# ------------------------------------------------------------------ #
|
||
logger.info("Loading full window %s–%s …", args.study_start, args.study_end)
|
||
cr_full, sei_full, _, n_st_full = load_full_window(
|
||
station_ids, args.study_start, args.study_end,
|
||
args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
|
||
)
|
||
|
||
logger.info("Loading in-sample window %s–%s …", INSAMPLE_START, INSAMPLE_END)
|
||
try:
|
||
cr_in, sei_in, _, _ = load_full_window(
|
||
station_ids, INSAMPLE_START, INSAMPLE_END,
|
||
args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
|
||
)
|
||
except Exception as exc:
|
||
logger.warning("In-sample load failed: %s", exc)
|
||
cr_in = sei_in = None
|
||
|
||
logger.info("Loading OOS window %s–%s …", OOS_START, args.study_end)
|
||
try:
|
||
cr_oos, sei_oos, _, _ = load_full_window(
|
||
station_ids, OOS_START, args.study_end,
|
||
args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
|
||
)
|
||
except Exception as exc:
|
||
logger.warning("OOS load failed: %s", exc)
|
||
cr_oos = sei_oos = None
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 2. Surrogate significance per window #
|
||
# ------------------------------------------------------------------ #
|
||
def _run_surr(cr: pd.Series, sei: pd.Series, label: str) -> dict:
|
||
valid = cr.notna() & sei.notna()
|
||
x = cr[valid].to_numpy(dtype=np.float32)
|
||
y = sei[valid].to_numpy(dtype=np.float32)
|
||
if len(x) < 50:
|
||
return {"p_global": np.nan, "sigma": np.nan, "peak_lag_days": None}
|
||
sr = surrogate_xcorr_test_gpu(
|
||
x, y, lag_bins, n_surrogates=args.n_surrogates,
|
||
method="phase", seed=args.seed,
|
||
)
|
||
p = float(sr["p_global"])
|
||
sig = p_to_sigma(p) if p > 0 else float("inf")
|
||
pk = int(sr["observed_peak_lag"]) * args.bin_days
|
||
logger.info("%s: p_global=%.4f (%.2fσ), peak=%+d d", label, p, sig, pk)
|
||
return {"p_global": round(p, 6), "sigma": round(sig, 3),
|
||
"peak_lag_days": pk, "T": len(x)}
|
||
|
||
surr_full = _run_surr(cr_full, sei_full, "full")
|
||
surr_insample = _run_surr(cr_in, sei_in, "in-sample") if cr_in is not None else {}
|
||
surr_oos = _run_surr(cr_oos, sei_oos, "OOS") if cr_oos is not None else {}
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 3. Rolling r(τ=+15 d) — full series #
|
||
# ------------------------------------------------------------------ #
|
||
logger.info("Computing rolling r(τ=+15 d), window=%d bins, step=%d bins …",
|
||
roll_window, roll_step)
|
||
rolling_full = compute_rolling_r15(
|
||
cr_full, sei_full, roll_window, roll_step, lag_bins=LAG_15D_BINS,
|
||
)
|
||
logger.info("Rolling windows: %d", len(rolling_full))
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 4. Sinusoid fit + Bayes factor #
|
||
# ------------------------------------------------------------------ #
|
||
if rolling_full:
|
||
years_arr = np.array([rw["center_year"] for rw in rolling_full])
|
||
r_arr = np.array([rw["r"] for rw in rolling_full])
|
||
sinusoid_fit = fit_sinusoid(
|
||
years_arr, r_arr,
|
||
period_bounds=(args.period_min, args.period_max),
|
||
)
|
||
logger.info(
|
||
"Sinusoid fit: P=%.2f yr, BF=%.3f, ΔBIC=%.2f",
|
||
sinusoid_fit.get("model_B_period", np.nan),
|
||
sinusoid_fit.get("bayes_factor", np.nan),
|
||
sinusoid_fit.get("delta_bic", np.nan),
|
||
)
|
||
else:
|
||
sinusoid_fit = {"status": "no_rolling_data"}
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 5. Station roster comparison (optional) #
|
||
# ------------------------------------------------------------------ #
|
||
roster_res = {}
|
||
if args.roster:
|
||
roster_res = roster_analysis(
|
||
station_ids, args.nmdb_dir,
|
||
INSAMPLE_START, INSAMPLE_END,
|
||
OOS_START, args.study_end,
|
||
args.usgs_dir, args.bin_days,
|
||
min(args.n_surrogates, 1000), lag_bins, args.seed,
|
||
)
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 6. Figure #
|
||
# ------------------------------------------------------------------ #
|
||
make_combined_figure(
|
||
rolling_full, sinusoid_fit,
|
||
INSAMPLE_END, OOS_START,
|
||
args.study_start, args.study_end,
|
||
args.output_dir / "figs" / "full_series_with_envelope_fit.png",
|
||
)
|
||
|
||
# ------------------------------------------------------------------ #
|
||
# 7. JSON + report #
|
||
# ------------------------------------------------------------------ #
|
||
combined_results = {
|
||
"study_start": args.study_start,
|
||
"study_end": args.study_end,
|
||
"n_surrogates": args.n_surrogates,
|
||
"gpu_device": _GPU_REASON,
|
||
"roll_window_bins": roll_window,
|
||
"roll_step_bins": roll_step,
|
||
"p_global_full": surr_full.get("p_global"),
|
||
"sigma_full": surr_full.get("sigma"),
|
||
"peak_lag_full": surr_full.get("peak_lag_days"),
|
||
"T_full": surr_full.get("T"),
|
||
"p_global_insample": surr_insample.get("p_global"),
|
||
"sigma_insample": surr_insample.get("sigma"),
|
||
"peak_lag_insample": surr_insample.get("peak_lag_days"),
|
||
"p_global_oos": surr_oos.get("p_global"),
|
||
"sigma_oos": surr_oos.get("sigma"),
|
||
"peak_lag_oos": surr_oos.get("peak_lag_days"),
|
||
"sinusoid_fit": sinusoid_fit,
|
||
"roster": roster_res,
|
||
"n_rolling_windows": len(rolling_full),
|
||
"rolling_windows": rolling_full,
|
||
}
|
||
|
||
json_path = args.output_dir / "combined_analysis.json"
|
||
json_path.write_text(
|
||
json.dumps(combined_results, indent=2, default=str), encoding="utf-8"
|
||
)
|
||
logger.info("JSON saved: %s", json_path)
|
||
|
||
write_combined_report(combined_results, sinusoid_fit, args, args.output_dir)
|
||
|
||
# Summary
|
||
print()
|
||
print("=" * 72)
|
||
print(f" COMBINED ANALYSIS: {args.study_start[:4]}–{args.study_end[:4]}")
|
||
print("=" * 72)
|
||
print(f" Full: p={surr_full.get('p_global', float('nan')):.4f} "
|
||
f"σ={surr_full.get('sigma', float('nan')):.2f} "
|
||
f"peak={surr_full.get('peak_lag_days', '?')} d")
|
||
print(f" In-sample: p={surr_insample.get('p_global', float('nan')):.4f}")
|
||
print(f" OOS: p={surr_oos.get('p_global', float('nan')):.4f}")
|
||
bf_val = sinusoid_fit.get("bayes_factor", float("nan"))
|
||
P_val = sinusoid_fit.get("model_B_period", float("nan"))
|
||
print(f" Sinusoid: P={P_val:.2f} yr BF={bf_val:.3f} "
|
||
f"({'preferred' if bf_val > 1 else 'not preferred'} vs constant)")
|
||
print("=" * 72)
|
||
print()
|
||
logger.info("Done.")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
run(_parse_args())
|