cosmicraysandearthquakes/scripts/08_combined_timeseries.py
root 798b0744b5 Fix seismic metric + robustness checks (issues 2a–2d)
2a — Replace physically invalid summed-Mw seismic metric with correct
     log10(ΣE) where E=10^(1.5·Mw+4.8) J (Kanamori 1977).  Updates
     usgs.py (new seismic_energy_per_bin()), scripts 02/07/08.
     Impact: r(+15d) raw 0.310→0.081, peak r 0.469→0.139; sinusoidal
     BF 27.5→0.75 (constant model now preferred).

2b — HP λ derivation: λ_5 = 1600×(365/5)^4 ≈ 4.54×10^10 (Ravn & Uhlig
     2002 rescaling); detrend robustness figure (HP/Butterworth/rolling
     mean), all show r(+15d)<0.04.

2c — Neff comparison: Bartlett 2923, Bretherton 769, Monte Carlo 594;
     MC 95% CI [-0.002, +0.158] straddles zero → raw r not significant
     under most conservative estimate.

2d — Magnitude threshold (M≥4.5/5.0/6.0): r(+15d) range 0.050–0.079
     (Δ=0.029 < 0.05), peak stable at τ=−525 d; no aftershock bias.

Paper (main.tex, refs.bib): update all numbers, add Kanamori/Bartlett/
Ravn–Uhlig refs, new sections for 2b–2d with figures and Neff table.
PDF recompiled (27 pages).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 14:49:54 +02:00

807 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
scripts/08_combined_timeseries.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Combined analysis on the FULL series (1976-to-end), comparing in-sample
(1976-2019) vs out-of-sample (2020-to-end) vs combined significance.
Key analyses
------------
1. Overall significance on the full window (phase surrogates, GPU)
2. Rolling r(τ=+15 d) in 3-year windows, 1-year steps — full 50-year span
3. Sinusoid fit: period ∈ [9, 13] years to the rolling r time series
4. Bayes factor: constant vs 11-year sinusoidal model (BIC approximation)
5. Does appending OOS data strengthen or weaken significance?
6. Station roster A/B/C comparison
Outputs
-------
results/figs/full_series_with_envelope_fit.png
results/combined_analysis.json
results/combined_analysis_report.md
Usage
-----
python scripts/08_combined_timeseries.py
python scripts/08_combined_timeseries.py --study-end 2026-01-01 --n-surrogates 10000
"""
from __future__ import annotations
import argparse
import json
import logging
import math
import sys
from datetime import datetime, timezone
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd
import scipy.optimize
import scipy.stats
import yaml
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
from crq.ingest.nmdb import load_station, resample_daily
from crq.ingest.usgs import load_usgs, seismic_energy_per_bin
from crq.stats.surrogates_gpu import (
surrogate_xcorr_test_gpu,
gpu_available,
_GPU_REASON,
)
from crq.stats.surrogates import n_eff_bretherton, p_to_sigma
from crq.ingest.station_roster import (
station_cr_series,
global_cr_index,
probe_station_coverage,
classify_stations,
)
# Module-wide logging: timestamped, level-aligned records.
logging.basicConfig(
    level=logging.INFO,
    # Fix: the original format was "%(name)s%(message)s" with no separator,
    # which ran the logger name straight into the message text.
    format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
logger = logging.getLogger("crq.combined")

BIN_DAYS = 5                    # temporal bin width in days
LAG_15D_BINS = 15 // BIN_DAYS   # the +15-day lag expressed in bins (= 3)
INSAMPLE_START = "1976-01-01"   # in-sample window bounds
INSAMPLE_END = "2019-12-31"
OOS_START = "2020-01-01"        # out-of-sample window starts here
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
def _ref_index(study_start: str, study_end: str, bin_days: int) -> pd.DatetimeIndex:
t0 = pd.Timestamp(study_start)
t1 = pd.Timestamp(study_end)
n = (t1 - t0).days // bin_days + 1
return pd.DatetimeIndex([t0 + pd.Timedelta(days=i * bin_days) for i in range(n)])
def _bin_series(s: pd.Series, t0: pd.Timestamp, bin_days: int, agg: str = "mean") -> pd.Series:
days = (s.index - t0).days
bn = days // bin_days
bd = t0 + pd.to_timedelta(bn * bin_days, unit="D")
grp = s.groupby(bd)
return grp.sum() if agg == "sum" else grp.mean()
def load_full_window(
    station_ids: list[str],
    study_start: str,
    study_end: str,
    nmdb_dir: Path,
    usgs_dir: Path,
    bin_days: int = BIN_DAYS,
    min_mag: float = 4.0,
    min_stations: int = 3,
    coverage_threshold: float = 0.60,
) -> tuple[pd.Series, pd.Series, pd.DatetimeIndex, int]:
    """
    Load, normalise and bin the CR and seismic series for one window.

    CR side: each station's daily counts are normalised by that station's
    own window mean, then averaged across stations; days with fewer than
    min(min_stations, n_available) reporting stations are masked to NaN.
    Seismic side: per-bin series from seismic_energy_per_bin() (per its
    name, log10 of summed energy — computed upstream); bins missing from
    its output are filled with the window minimum as a floor value.

    Returns (cr_bin, seismic_bin, ref_index, n_stations).
    Raises RuntimeError if no station yields usable data in the window.
    """
    t0 = pd.Timestamp(study_start)
    t1 = pd.Timestamp(study_end)
    start_year = t0.year
    end_year = t1.year
    ref_index = _ref_index(study_start, study_end, bin_days)
    # Per-station CR: normalised daily series keyed by station id
    norm_cols: dict[str, pd.Series] = {}
    for station in station_ids:
        hourly = load_station(station, start_year, end_year, nmdb_dir)
        if hourly.empty:
            continue
        daily_df = resample_daily(hourly, station, coverage_threshold=coverage_threshold)
        daily = daily_df[station].loc[study_start:study_end]
        n_valid = int(daily.notna().sum())
        # Require at least 30 valid days for the station to contribute
        if n_valid < 30:
            continue
        mean_ = daily.mean()
        # Guard against zero/NaN mean before dividing
        if not (np.isfinite(mean_) and mean_ > 0):
            continue
        norm_cols[station] = (daily / mean_).dropna()
    n_stations = len(norm_cols)
    if n_stations == 0:
        raise RuntimeError(f"No CR station data for {study_start}{study_end}")
    df_norm = pd.DataFrame(norm_cols)
    # Count of stations reporting per day; used to mask thin-coverage days
    n_valid_day = df_norm.notna().sum(axis=1)
    global_daily = df_norm.mean(axis=1)
    # min(min_stations, n_stations) so windows with few stations total
    # are not masked out entirely
    global_daily[n_valid_day < min(min_stations, n_stations)] = np.nan
    cr_bin = _bin_series(global_daily, t0, bin_days).reindex(ref_index)
    # Seismic: log10 of summed seismic energy (E = 10^(1.5·Mw+4.8) J)
    events = load_usgs(start_year, end_year, usgs_dir)
    seismic_bin = seismic_energy_per_bin(
        events, study_start, study_end, bin_days, t0, min_mag=min_mag,
    )
    # Bins with no events are filled with the window floor rather than NaN
    floor_val = float(seismic_bin.min())
    seismic_bin = seismic_bin.reindex(ref_index, fill_value=floor_val)
    # Align both series on the shared index before returning
    common = cr_bin.index.intersection(seismic_bin.index)
    cr_bin = cr_bin.reindex(common)
    seismic_bin = seismic_bin.reindex(common)
    logger.info(
        "Window %s%s: %d CR stations, %d bins, %d events",
        study_start[:4], study_end[:4], n_stations, len(common), len(events),
    )
    return cr_bin, seismic_bin, ref_index, n_stations
# ---------------------------------------------------------------------------
# Rolling r(τ=15)
# ---------------------------------------------------------------------------
def compute_rolling_r15(
    cr: pd.Series,
    seismic: pd.Series,
    window_bins: int,
    step_bins: int,
    lag_bins: int = LAG_15D_BINS,
) -> list[dict]:
    """
    Rolling Pearson r at a fixed lag over the full series.

    Slides a window of `window_bins` bins over the aligned CR/seismic
    arrays in steps of `step_bins`, correlating CR against seismic
    shifted forward by `lag_bins` bins (+15 d at 5-d binning). Windows
    with fewer than 30 valid bins, or fewer than 10 lagged pairs, or
    where the lag exceeds the valid-sample count, are skipped.

    Returns list of {center_date, center_year, r, n_pairs, n_eff}.
    """
    cr_arr = cr.to_numpy(dtype=np.float64)
    sei_arr = seismic.to_numpy(dtype=np.float64)
    dates = cr.index
    T = len(cr_arr)
    results = []
    for i0 in range(0, T - window_bins + 1, step_bins):
        i1 = i0 + window_bins
        x = cr_arr[i0:i1]
        y = sei_arr[i0:i1]
        # Keep only bins where both series are finite
        valid = np.isfinite(x) & np.isfinite(y)
        if valid.sum() < 30:
            continue
        x_v = x[valid]; y_v = y[valid]
        n = len(x_v)
        L = lag_bins
        if L >= n:
            continue
        n_pairs = n - L
        # NOTE(review): the lag is applied AFTER dropping NaN bins, so
        # across data gaps the physical time between paired samples can
        # exceed lag_bins*bin_days days — confirm this is intended.
        xa = x_v[:n_pairs]; ya = y_v[L:L + n_pairs]
        if n_pairs < 10:
            continue
        try:
            r, _ = scipy.stats.pearsonr(xa, ya)
        except Exception:
            # pearsonr can fail (e.g. constant input); record NaN for
            # this window rather than aborting the whole scan
            r = np.nan
        # Effective sample size correcting for autocorrelation (Bretherton)
        n_eff = float(n_eff_bretherton(x_v, y_v))
        center = dates[i0 + window_bins // 2]
        results.append({
            "center_date": str(center.date()),
            "center_year": center.year + center.month / 12,  # fractional year (month resolution)
            "r": float(r),
            "n_pairs": n_pairs,
            "n_eff": round(n_eff, 1),
        })
    return results
# ---------------------------------------------------------------------------
# Sinusoid fitting + Bayes factor (BIC)
# ---------------------------------------------------------------------------
def sinusoid_model(t: np.ndarray, A: float, P: float, phi: float, mu: float) -> np.ndarray:
    """Evaluate the sinusoid A·sin(2π·t/P + φ) + μ at times *t* (years)."""
    return A * np.sin(2 * np.pi / P * t + phi) + mu


def fit_sinusoid(
    years: np.ndarray,
    r_vals: np.ndarray,
    period_bounds: tuple[float, float] = (9.0, 13.0),
) -> dict:
    """
    Fit constant and sinusoidal models to the rolling r time series.

    Parameters
    ----------
    years : window-center times in fractional years (centred internally).
    r_vals : rolling correlation values, same length as *years*.
    period_bounds : allowed sinusoid period range in years.

    Returns
    -------
    dict with both models' parameters, RSS and BIC, the ΔBIC, the
    Bayes factor (BIC approximation, model B vs A) and the preferred
    model; or a short status dict when fitting is impossible.

    Fixes vs the previous revision:
    * rss_A is floored at 1e-20 before log() — a perfectly constant
      input previously raised ValueError (model B already had the floor).
    * exp(ΔBIC/2) is guarded against OverflowError (near-perfect fits
      produce huge ΔBIC); BF becomes +inf instead of crashing.
    """
    finite = np.isfinite(years) & np.isfinite(r_vals)
    t = years[finite] - years[finite].mean()  # centre time axis
    r = r_vals[finite]
    n = len(r)
    if n < 8:
        # Too few rolling windows for a meaningful 4-parameter fit
        return {"status": "too_few_points", "n": n}
    # Model A: constant (1 free parameter: mu)
    mu_A = r.mean()
    rss_A = float(np.sum((r - mu_A) ** 2))
    k_A = 1
    # Floor RSS so log() cannot receive 0 (same floor as model B)
    bic_A = n * math.log(max(rss_A, 1e-20) / n) + k_A * math.log(n)
    # Model B: A*sin(2π/P*t + φ) + mu (4 free parameters: A, P, φ, mu)
    # Multi-start grid over period and phase to avoid local minima.
    best_res = None
    best_rss = np.inf
    for P_init in np.linspace(period_bounds[0], period_bounds[1], 5):
        for phi_init in np.linspace(0, 2 * math.pi, 4, endpoint=False):
            try:
                popt, _ = scipy.optimize.curve_fit(
                    sinusoid_model, t, r,
                    p0=[0.02, P_init, phi_init, mu_A],
                    bounds=(
                        [-0.3, period_bounds[0], -2*math.pi, -0.5],
                        [ 0.3, period_bounds[1], 4*math.pi, 0.5],
                    ),
                    maxfev=10_000,
                )
                r_hat = sinusoid_model(t, *popt)
                rss = float(np.sum((r - r_hat) ** 2))
                if rss < best_rss:
                    best_rss = rss
                    best_res = popt
            except (RuntimeError, ValueError):
                # This start failed to converge / had invalid inputs —
                # try the next grid point.
                continue
    if best_res is None:
        return {"status": "fit_failed", "bic_A": bic_A, "rss_A": rss_A, "n": n}
    A_fit, P_fit, phi_fit, mu_fit = best_res
    k_B = 4
    rss_B = best_rss
    bic_B = n * math.log(max(rss_B, 1e-20) / n) + k_B * math.log(n)
    d_bic = bic_A - bic_B  # positive = B preferred
    try:
        bf = math.exp(d_bic / 2)  # Bayes factor B vs A (BIC approximation)
    except OverflowError:
        bf = float("inf")  # ΔBIC too large for exp(); evidence is overwhelming
    return {
        "status": "ok",
        "n": n,
        "model_A_mu": float(mu_A),
        "model_A_rss": float(rss_A),
        "model_A_bic": float(bic_A),
        "model_B_A": float(A_fit),
        "model_B_period": float(P_fit),
        "model_B_phi": float(phi_fit),
        "model_B_mu": float(mu_fit),
        "model_B_rss": float(rss_B),
        "model_B_bic": float(bic_B),
        "delta_bic": float(d_bic),
        "bayes_factor": float(bf),
        "preferred_model": "B (sinusoidal)" if d_bic > 0 else "A (constant)",
    }
# ---------------------------------------------------------------------------
# Station roster analysis
# ---------------------------------------------------------------------------
def roster_analysis(
    station_ids: list[str],
    nmdb_dir: Path,
    insample_start: str,
    insample_end: str,
    oos_start: str,
    oos_end: str,
    usgs_dir: Path,
    bin_days: int,
    n_surr: int,
    lag_bins: np.ndarray,
    seed: int,
) -> dict:
    """
    Compare surrogate-test p_global for station rosters A, B_oos, C
    over the out-of-sample window.

    Stations are classified by coverage into roster A (present in both
    windows), B_oos (all OOS stations) and C (new OOS-only stations);
    for each non-empty roster a normalised CR composite is built and the
    phase-surrogate cross-correlation test is run against the seismic
    series. Rosters with no usable data get p_global = None.

    Returns {"roster_sizes": ..., "rosters": ..., "p_global": {label: p}}.

    Fix vs the previous revision: the per-roster seed offset used
    `hash(label) % 1000`, but str hashes are salted per process
    (PYTHONHASHSEED), so results were not reproducible across runs
    despite a fixed --seed. The offset is now a deterministic byte sum.
    """
    windows = {
        "in_sample": (insample_start, insample_end),
        "out_of_sample": (oos_start, oos_end),
    }
    logger.info("Probing station coverage for roster classification …")
    cov: dict[str, dict[str, float]] = {}
    for sid in station_ids:
        try:
            cov[sid] = probe_station_coverage(sid, windows, nmdb_dir)
        except Exception as exc:
            # A probe failure just means zero coverage for that station
            logger.debug("station %s probe error: %s", sid, exc)
            cov[sid] = {k: 0.0 for k in windows}
    rosters = classify_stations(station_ids, cov,
                                in_sample_key="in_sample",
                                oos_key="out_of_sample")
    logger.info(
        "Rosters — A: %d B_oos: %d C (new): %d",
        len(rosters["A"]), len(rosters["B_oos"]), len(rosters["C"]),
    )
    results = {
        "roster_sizes": {k: len(v) for k, v in rosters.items()},
        "rosters": rosters,
        "p_global": {},
    }
    start_year = int(oos_start[:4])
    end_year = int(oos_end[:4])
    t0 = pd.Timestamp(oos_start)
    ref_idx = _ref_index(oos_start, oos_end, bin_days)
    # Seismic series (log10 summed energy, same for all rosters)
    events = load_usgs(start_year, end_year, usgs_dir)
    sei_bin = seismic_energy_per_bin(
        events, oos_start, oos_end, bin_days, t0, min_mag=4.0,
    )
    floor_val = float(sei_bin.min())
    sei_bin = sei_bin.reindex(ref_idx, fill_value=floor_val)
    for label, roster in [("A", rosters["A"]),
                          ("B_oos", rosters["B_oos"]),
                          ("C", rosters["C"])]:
        if not roster:
            results["p_global"][label] = None
            logger.info("Roster %s: empty — skip", label)
            continue
        # Build the roster's normalised CR composite
        norm_cols: dict[str, pd.Series] = {}
        for sid in roster:
            hourly = load_station(sid, start_year, end_year, nmdb_dir)
            if hourly.empty:
                continue
            daily_df = resample_daily(hourly, sid)
            daily = daily_df[sid].loc[oos_start:oos_end]
            mean_ = daily.mean()
            if not (np.isfinite(mean_) and mean_ > 0):
                continue
            norm_cols[sid] = (daily / mean_).dropna()
        if not norm_cols:
            results["p_global"][label] = None
            continue
        df_norm = pd.DataFrame(norm_cols)
        n_valid_day = df_norm.notna().sum(axis=1)
        global_daily = df_norm.mean(axis=1)
        # Require at least one reporting station per day
        global_daily[n_valid_day < 1] = np.nan
        cr_bin = _bin_series(global_daily, t0, bin_days).reindex(ref_idx)
        common = cr_bin.notna() & sei_bin.notna()
        x_arr = cr_bin[common].to_numpy(dtype=np.float32)
        y_arr = sei_bin[common].to_numpy(dtype=np.float32)
        if len(x_arr) < 50:
            results["p_global"][label] = None
            continue
        # Deterministic per-roster seed offset (reproducible across runs)
        label_offset = sum(label.encode()) % 1000
        sr = surrogate_xcorr_test_gpu(
            x_arr, y_arr, lag_bins,
            n_surrogates=n_surr, method="phase", seed=seed + label_offset,
        )
        results["p_global"][label] = round(float(sr["p_global"]), 6)
        logger.info("Roster %s (%d stations): p_global = %.4f",
                    label, len(roster), results["p_global"][label])
    return results
# ---------------------------------------------------------------------------
# Figures
# ---------------------------------------------------------------------------
def make_combined_figure(
    rolling_full: list[dict],
    sinusoid_fit: dict,
    insample_end: str,
    oos_start: str,
    study_start: str,
    study_end: str,
    output_path: Path,
) -> None:
    """
    Two-panel figure: rolling r(τ=+15 d) with an optional sinusoid-fit
    overlay (top) and the Bretherton effective sample size (bottom).

    Saves a PNG to *output_path* (parent directories created as needed)
    and closes the figure.
    """
    years = np.array([rw["center_year"] for rw in rolling_full])
    rs = np.array([rw["r"] for rw in rolling_full])
    fig = plt.figure(figsize=(16, 7))
    gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0.35)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])
    # ── Panel 1: rolling r(τ=+15 d) ─────────────────────────────────────
    insample_end_year = float(insample_end[:4]) + 1.0
    oos_start_year = float(oos_start[:4])
    # NOTE(review): these two masks overlap by roughly half a year around
    # the boundary, so boundary points are drawn in both colours — confirm
    # this is intended.
    in_mask = years <= insample_end_year
    oos_mask = years > oos_start_year - 0.5
    ax1.scatter(years[in_mask], rs[in_mask], s=20, color="steelblue", zorder=4,
                label="In-sample (19762019)")
    ax1.scatter(years[oos_mask], rs[oos_mask], s=20, color="tomato", zorder=4,
                label="Out-of-sample (2020)")
    ax1.plot(years, rs, color="k", linewidth=0.6, alpha=0.4)
    # Sinusoid fit overlay (only when the fit succeeded)
    if sinusoid_fit.get("status") == "ok":
        P = sinusoid_fit["model_B_period"]
        A = sinusoid_fit["model_B_A"]
        phi = sinusoid_fit["model_B_phi"]
        mu = sinusoid_fit["model_B_mu"]
        # The fit was performed on a centred time axis; re-centre here.
        # NOTE(review): fit_sinusoid centres on the mean of FINITE points
        # only — years.mean() can differ slightly if any r was NaN; verify.
        t_c = years.mean()
        t_dense = np.linspace(years.min(), years.max(), 500)
        fit_vals = sinusoid_model(t_dense - t_c, A, P, phi, mu)
        bf = sinusoid_fit["bayes_factor"]
        ax1.plot(t_dense, fit_vals, color="darkorange", linewidth=2.0,
                 label=f"Sinusoid fit: P={P:.1f} yr, A={A:.3f} (BF={bf:.2f})")
    ax1.axhline(0, color="k", linewidth=0.5)
    ax1.axvline(oos_start_year, color="gray", linewidth=1.2,
                linestyle="--", alpha=0.7, label="In-sample / OOS boundary")
    ax1.set_ylabel(f"Pearson r(τ = +15 d)")
    ax1.set_title(
        f"Full-series rolling r(τ=+15 d) in 3-year windows (1-year step) | "
        f"{study_start[:4]}{study_end[:4]}",
        fontsize=10,
    )
    ax1.legend(fontsize=9, loc="upper left")
    ax1.grid(True, alpha=0.3)
    # ── Panel 2: rolling n_eff ───────────────────────────────────────────
    n_effs = np.array([rw["n_eff"] for rw in rolling_full])
    ax2.fill_between(years, 0, n_effs, alpha=0.4, color="steelblue")
    ax2.axvline(oos_start_year, color="gray", linewidth=1.2, linestyle="--", alpha=0.7)
    ax2.set_xlabel("Year")
    ax2.set_ylabel("Effective N")
    ax2.set_title("Bretherton effective sample size per rolling window", fontsize=9)
    ax2.grid(True, alpha=0.3)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    logger.info("Combined figure saved: %s", output_path)
# ---------------------------------------------------------------------------
# Report
# ---------------------------------------------------------------------------
def _fv(v, fmt: str) -> str:
"""Format a possibly-None float; returns 'N/A' when None/NaN."""
if v is None:
return "N/A"
try:
return format(float(v), fmt)
except (TypeError, ValueError):
return "N/A"
def write_combined_report(
    results: dict,
    sinusoid_fit: dict,
    args: argparse.Namespace,
    output_dir: Path,
) -> None:
    """
    Render the markdown summary report to combined_analysis_report.md.

    Sections: per-window surrogate significance, sinusoidal-envelope fit
    parameters with a Jeffreys-scale reading of the Bayes factor, and the
    station-roster comparison. Missing metrics render as 'N/A' via _fv().
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    bf = sinusoid_fit.get("bayes_factor", float("nan"))
    P = sinusoid_fit.get("model_B_period", float("nan"))
    # Interpret Bayes factor (Jeffreys scale)
    if np.isnan(bf):
        bf_interp = "Could not be computed"
    elif bf < 1:
        bf_interp = f"BF = {bf:.2f} < 1: evidence FAVOURS constant model (no envelope)"
    elif bf < 3:
        bf_interp = f"BF = {bf:.2f}: anecdotal evidence for sinusoidal envelope"
    elif bf < 10:
        bf_interp = f"BF = {bf:.2f}: moderate evidence for sinusoidal envelope"
    elif bf < 30:
        bf_interp = f"BF = {bf:.2f}: strong evidence for sinusoidal envelope"
    else:
        bf_interp = f"BF = {bf:.2f}: very strong evidence for sinusoidal envelope"
    # Template lines are intentionally flush-left: they are literal
    # markdown inside the triple-quoted f-string.
    md = f"""# Combined Full-Series Analysis (1976{args.study_end[:4]})
Generated: {ts}
Full window: {args.study_start}{args.study_end}
In-sample: {INSAMPLE_START}{INSAMPLE_END}
Out-of-sample: {OOS_START}{args.study_end}
GPU: {_GPU_REASON}
Surrogates: {args.n_surrogates:,} per window
## Does appending OOS data strengthen or weaken significance?
| Window | p_global | σ_surrogate | peak lag |
|---|---|---|---|
| In-sample (19762019) | {_fv(results.get('p_global_insample'), '.4f')} | {_fv(results.get('sigma_insample'), '.2f')} | {results.get('peak_lag_insample', 'N/A')} d |
| Out-of-sample (2020{args.study_end[:4]}) | {_fv(results.get('p_global_oos'), '.4f')} | {_fv(results.get('sigma_oos'), '.2f')} | {results.get('peak_lag_oos', 'N/A')} d |
| Combined (1976{args.study_end[:4]}) | {_fv(results.get('p_global_full'), '.4f')} | {_fv(results.get('sigma_full'), '.2f')} | {results.get('peak_lag_full', 'N/A')} d |
## Sinusoidal envelope fit
{bf_interp}
Best-fit period: **{P:.2f} years** (constrained to [9, 13] years)
| Parameter | Value |
|---|---|
| Period P | {sinusoid_fit.get('model_B_period', float('nan')):.2f} yr |
| Amplitude A | {sinusoid_fit.get('model_B_A', float('nan')):.4f} |
| Phase φ | {sinusoid_fit.get('model_B_phi', float('nan')):.2f} rad |
| Baseline μ | {sinusoid_fit.get('model_B_mu', float('nan')):.4f} |
| Model B BIC | {sinusoid_fit.get('model_B_bic', float('nan')):.2f} |
| Model A BIC | {sinusoid_fit.get('model_A_bic', float('nan')):.2f} |
| ΔBIC (AB) | {sinusoid_fit.get('delta_bic', float('nan')):.2f} |
| Bayes factor (BF) | {bf:.3f} |
## Station roster comparison (OOS window)
| Roster | Description | Stations | p_global |
|---|---|---|---|
| A | In BOTH windows | {results.get('roster', {}).get('roster_sizes', {}).get('A', '?')} | {results.get('roster', {}).get('p_global', {}).get('A', 'N/A')} |
| B_oos | All OOS stations | {results.get('roster', {}).get('roster_sizes', {}).get('B_oos', '?')} | {results.get('roster', {}).get('p_global', {}).get('B_oos', 'N/A')} |
| C | New OOS-only | {results.get('roster', {}).get('roster_sizes', {}).get('C', '?')} | {results.get('roster', {}).get('p_global', {}).get('C', 'N/A')} |
A real effect should appear consistently across all three rosters.
Divergence (e.g., significant only in A) would suggest station-selection bias.
## Figure
`results/figs/full_series_with_envelope_fit.png`
"""
    path = output_dir / "combined_analysis_report.md"
    path.write_text(md, encoding="utf-8")
    logger.info("Combined report saved: %s", path)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for this script."""
    p = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Analysis window (end date defaults to data_availability.json at runtime)
    p.add_argument("--study-start", default=INSAMPLE_START)
    p.add_argument("--study-end", default=None,
                   help="Override end date (default: read from data_availability.json)")
    # Binning and cross-correlation lag range, in days
    p.add_argument("--bin-days", type=int, default=BIN_DAYS)
    p.add_argument("--lag-min", type=int, default=-200)
    p.add_argument("--lag-max", type=int, default=+200)
    p.add_argument("--min-mag", type=float, default=4.0)
    p.add_argument("--n-surrogates", type=int, default=10_000,
                   help="Surrogates per window surrogate test (default 10 000)")
    # Rolling-correlation window/step, in bins (None -> 3 yr / 1 yr defaults)
    p.add_argument("--roll-window", type=int, default=None,
                   help="Rolling window bins (default: 3 years)")
    p.add_argument("--roll-step", type=int, default=None,
                   help="Rolling step bins (default: 1 year)")
    # Sinusoid period search bounds, in years
    p.add_argument("--period-min", type=float, default=9.0)
    p.add_argument("--period-max", type=float, default=13.0)
    p.add_argument("--roster", action="store_true",
                   help="Run station roster A/B/C comparison (slow)")
    p.add_argument("--seed", type=int, default=42)
    # Data / config / output locations
    p.add_argument("--nmdb-dir", type=Path, default=PROJECT_ROOT/"data"/"raw"/"nmdb")
    p.add_argument("--usgs-dir", type=Path, default=PROJECT_ROOT/"data"/"raw"/"usgs")
    p.add_argument("--config", type=Path, default=PROJECT_ROOT/"config"/"stations.yaml")
    p.add_argument("--output-dir",type=Path, default=PROJECT_ROOT/"results")
    return p.parse_args()
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def run(args: argparse.Namespace) -> None:
    """
    Execute the full combined-analysis pipeline.

    Steps: resolve the study end date; load the full, in-sample and OOS
    windows; run the GPU phase-surrogate test on each; compute the
    rolling r(τ=+15 d) series; fit the sinusoidal envelope; optionally
    compare station rosters; write the figure, JSON and markdown report,
    and print a console summary.
    """
    args.output_dir.mkdir(parents=True, exist_ok=True)
    (args.output_dir / "figs").mkdir(exist_ok=True)
    # Determine study end
    avail_path = args.output_dir / "data_availability.json"
    if args.study_end is None:
        if avail_path.exists():
            avail = json.loads(avail_path.read_text())
            args.study_end = avail["oos_end"]
            logger.info("Study end from data_availability.json: %s", args.study_end)
        else:
            # Fall back to "today minus 60 days", presumably to avoid
            # not-yet-complete recent data — confirm against ingest docs.
            from datetime import date, timedelta
            args.study_end = str(date.today() - timedelta(days=60))
            logger.warning("data_availability.json not found; using %s", args.study_end)
    with open(args.config) as fh:
        cfg = yaml.safe_load(fh)
    station_ids = list(cfg["stations"].keys())
    # Convert the lag range from days to bin units
    lag_min_b = args.lag_min // args.bin_days
    lag_max_b = args.lag_max // args.bin_days
    lag_bins = np.arange(lag_min_b, lag_max_b + 1, dtype=int)
    # Rolling window / step defaults
    roll_window = args.roll_window or int(3 * 365.25 / args.bin_days)  # 3 years
    roll_step = args.roll_step or int(1 * 365.25 / args.bin_days)  # 1 year
    # ------------------------------------------------------------------ #
    # 1. Load three windows                                              #
    # ------------------------------------------------------------------ #
    logger.info("Loading full window %s%s", args.study_start, args.study_end)
    cr_full, sei_full, _, n_st_full = load_full_window(
        station_ids, args.study_start, args.study_end,
        args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
    )
    logger.info("Loading in-sample window %s%s", INSAMPLE_START, INSAMPLE_END)
    try:
        cr_in, sei_in, _, _ = load_full_window(
            station_ids, INSAMPLE_START, INSAMPLE_END,
            args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
        )
    except Exception as exc:
        # Sub-window failures are non-fatal; the report shows N/A instead
        logger.warning("In-sample load failed: %s", exc)
        cr_in = sei_in = None
    logger.info("Loading OOS window %s%s", OOS_START, args.study_end)
    try:
        cr_oos, sei_oos, _, _ = load_full_window(
            station_ids, OOS_START, args.study_end,
            args.nmdb_dir, args.usgs_dir, args.bin_days, args.min_mag,
        )
    except Exception as exc:
        logger.warning("OOS load failed: %s", exc)
        cr_oos = sei_oos = None
    # ------------------------------------------------------------------ #
    # 2. Surrogate significance per window                               #
    # ------------------------------------------------------------------ #
    def _run_surr(cr: pd.Series, sei: pd.Series, label: str) -> dict:
        # Phase-surrogate significance test on the overlapping valid bins
        valid = cr.notna() & sei.notna()
        x = cr[valid].to_numpy(dtype=np.float32)
        y = sei[valid].to_numpy(dtype=np.float32)
        if len(x) < 50:
            return {"p_global": np.nan, "sigma": np.nan, "peak_lag_days": None}
        sr = surrogate_xcorr_test_gpu(
            x, y, lag_bins, n_surrogates=args.n_surrogates,
            method="phase", seed=args.seed,
        )
        p = float(sr["p_global"])
        # p == 0 means "beyond surrogate resolution": report infinite sigma
        sig = p_to_sigma(p) if p > 0 else float("inf")
        # Peak lag converted from bins back to days
        pk = int(sr["observed_peak_lag"]) * args.bin_days
        logger.info("%s: p_global=%.4f (%.2fσ), peak=%+d d", label, p, sig, pk)
        return {"p_global": round(p, 6), "sigma": round(sig, 3),
                "peak_lag_days": pk, "T": len(x)}
    surr_full = _run_surr(cr_full, sei_full, "full")
    surr_insample = _run_surr(cr_in, sei_in, "in-sample") if cr_in is not None else {}
    surr_oos = _run_surr(cr_oos, sei_oos, "OOS") if cr_oos is not None else {}
    # ------------------------------------------------------------------ #
    # 3. Rolling r(τ=+15 d) — full series                                #
    # ------------------------------------------------------------------ #
    logger.info("Computing rolling r(τ=+15 d), window=%d bins, step=%d bins …",
                roll_window, roll_step)
    rolling_full = compute_rolling_r15(
        cr_full, sei_full, roll_window, roll_step, lag_bins=LAG_15D_BINS,
    )
    logger.info("Rolling windows: %d", len(rolling_full))
    # ------------------------------------------------------------------ #
    # 4. Sinusoid fit + Bayes factor                                     #
    # ------------------------------------------------------------------ #
    if rolling_full:
        years_arr = np.array([rw["center_year"] for rw in rolling_full])
        r_arr = np.array([rw["r"] for rw in rolling_full])
        sinusoid_fit = fit_sinusoid(
            years_arr, r_arr,
            period_bounds=(args.period_min, args.period_max),
        )
        logger.info(
            "Sinusoid fit: P=%.2f yr, BF=%.3f, ΔBIC=%.2f",
            sinusoid_fit.get("model_B_period", np.nan),
            sinusoid_fit.get("bayes_factor", np.nan),
            sinusoid_fit.get("delta_bic", np.nan),
        )
    else:
        sinusoid_fit = {"status": "no_rolling_data"}
    # ------------------------------------------------------------------ #
    # 5. Station roster comparison (optional)                            #
    # ------------------------------------------------------------------ #
    roster_res = {}
    if args.roster:
        # Surrogates are capped at 1000 here: three rosters at the full
        # count would be slow
        roster_res = roster_analysis(
            station_ids, args.nmdb_dir,
            INSAMPLE_START, INSAMPLE_END,
            OOS_START, args.study_end,
            args.usgs_dir, args.bin_days,
            min(args.n_surrogates, 1000), lag_bins, args.seed,
        )
    # ------------------------------------------------------------------ #
    # 6. Figure                                                          #
    # ------------------------------------------------------------------ #
    make_combined_figure(
        rolling_full, sinusoid_fit,
        INSAMPLE_END, OOS_START,
        args.study_start, args.study_end,
        args.output_dir / "figs" / "full_series_with_envelope_fit.png",
    )
    # ------------------------------------------------------------------ #
    # 7. JSON + report                                                   #
    # ------------------------------------------------------------------ #
    combined_results = {
        "study_start": args.study_start,
        "study_end": args.study_end,
        "n_surrogates": args.n_surrogates,
        "gpu_device": _GPU_REASON,
        "roll_window_bins": roll_window,
        "roll_step_bins": roll_step,
        "p_global_full": surr_full.get("p_global"),
        "sigma_full": surr_full.get("sigma"),
        "peak_lag_full": surr_full.get("peak_lag_days"),
        "T_full": surr_full.get("T"),
        "p_global_insample": surr_insample.get("p_global"),
        "sigma_insample": surr_insample.get("sigma"),
        "peak_lag_insample": surr_insample.get("peak_lag_days"),
        "p_global_oos": surr_oos.get("p_global"),
        "sigma_oos": surr_oos.get("sigma"),
        "peak_lag_oos": surr_oos.get("peak_lag_days"),
        "sinusoid_fit": sinusoid_fit,
        "roster": roster_res,
        "n_rolling_windows": len(rolling_full),
        "rolling_windows": rolling_full,
    }
    json_path = args.output_dir / "combined_analysis.json"
    # default=str stringifies non-JSON types (Paths, timestamps) on dump
    json_path.write_text(
        json.dumps(combined_results, indent=2, default=str), encoding="utf-8"
    )
    logger.info("JSON saved: %s", json_path)
    write_combined_report(combined_results, sinusoid_fit, args, args.output_dir)
    # Summary
    print()
    print("=" * 72)
    print(f" COMBINED ANALYSIS: {args.study_start[:4]}{args.study_end[:4]}")
    print("=" * 72)
    print(f" Full: p={surr_full.get('p_global', float('nan')):.4f} "
          f"σ={surr_full.get('sigma', float('nan')):.2f} "
          f"peak={surr_full.get('peak_lag_days', '?')} d")
    print(f" In-sample: p={surr_insample.get('p_global', float('nan')):.4f}")
    print(f" OOS: p={surr_oos.get('p_global', float('nan')):.4f}")
    bf_val = sinusoid_fit.get("bayes_factor", float("nan"))
    P_val = sinusoid_fit.get("model_B_period", float("nan"))
    print(f" Sinusoid: P={P_val:.2f} yr BF={bf_val:.3f} "
          f"({'preferred' if bf_val > 1 else 'not preferred'} vs constant)")
    print("=" * 72)
    print()
    logger.info("Done.")
# Script entry point: parse CLI options and execute the full pipeline.
if __name__ == "__main__":
    run(_parse_args())