cosmicraysandearthquakes/scripts/02_homola_replication.py
root 798b0744b5 Fix seismic metric + robustness checks (issues 2a–2d)
2a — Replace physically invalid summed-Mw seismic metric with correct
     log10(ΣE) where E=10^(1.5·Mw+4.8) J (Kanamori 1977).  Updates
     usgs.py (new seismic_energy_per_bin()), scripts 02/07/08.
     Impact: r(+15d) raw 0.310→0.081, peak r 0.469→0.139; sinusoidal
     BF 27.5→0.75 (constant model now preferred).

2b — HP λ derivation: λ_5 = 1600×(365/5)^4 ≈ 4.54×10^10 (Ravn & Uhlig
     2002 rescaling); detrend robustness figure (HP/Butterworth/rolling
     mean), all show r(+15d)<0.04.

2c — Neff comparison: Bartlett 2923, Bretherton 769, Monte Carlo 594;
     MC 95% CI [-0.002, +0.158] straddles zero → raw r not significant
     under most conservative estimate.

2d — Magnitude threshold (M≥4.5/5.0/6.0): r(+15d) range 0.050–0.079
     (Δ=0.029 < 0.05), peak stable at τ=−525 d; no aftershock bias.

Paper (main.tex, refs.bib): update all numbers, add Kanamori/Bartlett/
Ravn–Uhlig refs, new sections for 2b–2d with figures and Neff table.
PDF recompiled (27 pages).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 14:49:54 +02:00


#!/usr/bin/env python3
"""
scripts/02_homola_replication.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Reproduce the Homola et al. 2023 (JASTP 247, 106068) cross-correlation analysis
as literally as possible from the paper.
Method (from the paper)
-----------------------
* Data: NMDB pressure-corrected hourly neutron counts; USGS earthquake catalogue.
* CR index: Each station is normalised to its own long-term mean count rate
(so all stations become dimensionless relative-rate series around 1.0).
The global index is the mean across all stations that have valid data at
each 5-day bin.
* Seismic metric: sum of Mw for all M ≥ 4.0 events per 5-day bin (Homola
metric; zero for bins with no events).
* Binning: non-overlapping 5-day bins anchored at study_start.
* Correlation: Pearson r(τ) for τ ∈ [-1000, +1000] days (step = 5 days).
Convention: τ > 0 means CR leads seismic (at lag τ the seismic series is
advanced τ days into the future relative to the CR series).
* Significance: "naive" Pearson t-test p-value (treats bins as i.i.d. — this
is the inflated significance that Homola reports; reproduced here as the
baseline before phase-randomised correction).
Outputs
-------
results/figs/homola_replication.png — r(τ) plot with peak annotated
results/homola_replication.json — machine-readable summary
"""
from __future__ import annotations

import argparse
import json
import logging
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

import matplotlib

matplotlib.use("Agg")

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import scipy.stats
import yaml

PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from crq.ingest.nmdb import load_station, resample_daily
from crq.ingest.usgs import load_usgs, seismic_energy_per_bin

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
logger = logging.getLogger("crq.homola")

# ---------------------------------------------------------------------------
# Constants (matching Homola et al. 2023)
# ---------------------------------------------------------------------------
STUDY_START = "1976-01-01"
STUDY_END = "2019-12-31"
BIN_DAYS = 5
LAG_MIN_DAYS = -1000
LAG_MAX_DAYS = +1000
MIN_MAG = 4.0
SEED = 42
NMDB_COVERAGE_THRESHOLD = 0.60 # min valid hourly fraction per station-day
MIN_STATIONS_PER_BIN = 3 # global index → NaN if fewer stations valid
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _get_git_sha() -> str:
    try:
        r = subprocess.run(
            ["git", "-C", str(PROJECT_ROOT), "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, timeout=5,
        )
        return r.stdout.strip() if r.returncode == 0 else "unknown"
    except Exception:
        return "unknown"


def _load_station_names(config_path: Path) -> list[str]:
    with config_path.open() as fh:
        return list(yaml.safe_load(fh)["stations"].keys())

def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--study-start", default=STUDY_START)
    p.add_argument("--study-end", default=STUDY_END)
    p.add_argument("--bin-days", type=int, default=BIN_DAYS)
    p.add_argument("--lag-min", type=int, default=LAG_MIN_DAYS)
    p.add_argument("--lag-max", type=int, default=LAG_MAX_DAYS)
    p.add_argument("--min-mag", type=float, default=MIN_MAG)
    p.add_argument("--min-stations", type=int, default=MIN_STATIONS_PER_BIN)
    p.add_argument("--nmdb-dir", type=Path, default=PROJECT_ROOT / "data" / "raw" / "nmdb")
    p.add_argument("--usgs-dir", type=Path, default=PROJECT_ROOT / "data" / "raw" / "usgs")
    p.add_argument("--output-dir", type=Path, default=PROJECT_ROOT / "results")
    p.add_argument("--config", type=Path, default=PROJECT_ROOT / "config" / "stations.yaml")
    p.add_argument("--seed", type=int, default=SEED)
    return p.parse_args()

def _bin_series(
    series: pd.Series,
    study_start: str,
    bin_days: int,
    agg: str = "mean",
) -> pd.Series:
    """
    Resample a daily Series to *bin_days*-day bins anchored exactly at
    *study_start*.

    Uses manual floor-division binning so the anchor date is always respected,
    regardless of pandas version or calendar effects.

    Parameters
    ----------
    agg : ``"mean"`` (default) or ``"sum"``
        Aggregation function. Use ``"mean"`` for normalised CR rates;
        ``"sum"`` remains available for additive totals (the seismic metric
        is now built separately in seismic_energy_per_bin).
    """
    t0 = pd.Timestamp(study_start)
    days_from_origin = (series.index - t0).days
    bin_number = days_from_origin // bin_days
    bin_dates = t0 + pd.to_timedelta(bin_number * bin_days, unit="D")
    grouped = series.groupby(bin_dates)
    return grouped.sum() if agg == "sum" else grouped.mean()
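
# A worked example of the anchored binning above: with study_start="1976-01-01"
# and bin_days=5, a sample on 1976-01-07 has days_from_origin = 6, so
# bin_number = 6 // 5 = 1 and it falls in the bin labelled 1976-01-06.
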
# ---------------------------------------------------------------------------
# Step 1 — Global cosmic-ray index
# ---------------------------------------------------------------------------
def build_cr_index(
    station_names: list[str],
    study_start: str,
    study_end: str,
    bin_days: int,
    nmdb_dir: Path,
    *,
    coverage_threshold: float = NMDB_COVERAGE_THRESHOLD,
    min_stations: int = MIN_STATIONS_PER_BIN,
) -> tuple[pd.Series, pd.DataFrame, int]:
    """
    Load all available NMDB stations, normalise each to its mean count rate
    over the study window, and return a *bin_days*-day-binned global index.

    Returns
    -------
    cr_index           : binned global CR series (NaN if < min_stations valid)
    station_norm_daily : daily normalised series per station (for diagnostics)
    effective_min      : the min_stations threshold actually applied
    """
    t0 = pd.Timestamp(study_start)
    t1 = pd.Timestamp(study_end)
    start_year, end_year = t0.year, t1.year
    norm_cols: dict[str, pd.Series] = {}
    for station in station_names:
        hourly = load_station(station, start_year, end_year, nmdb_dir)
        if hourly.empty:
            continue
        daily_df = resample_daily(hourly, station, coverage_threshold=coverage_threshold)
        daily = daily_df[station].loc[study_start:study_end]
        n_valid = int(daily.notna().sum())
        if n_valid < 30:
            continue
        station_mean = daily.mean()
        if not np.isfinite(station_mean) or station_mean <= 0:
            logger.warning("station %s: non-positive mean — skipping", station)
            continue
        norm_cols[station] = daily / station_mean  # dimensionless ~1.0
        n_total = (t1 - t0).days + 1
        coverage = 100 * n_valid / n_total
        logger.info(
            "station %-6s valid_days=%5d coverage=%5.1f%% mean=%.1f",
            station, n_valid, coverage, station_mean,
        )
    n_loaded = len(norm_cols)
    logger.info("Loaded %d stations with valid data.", n_loaded)
    if n_loaded == 0:
        raise RuntimeError(
            "No valid station data found — run scripts/01_download_data.py first."
        )
    # Auto-adjust min_stations when data is sparse (e.g. subset test)
    effective_min = min_stations
    if n_loaded < min_stations:
        effective_min = max(1, n_loaded - 1) if n_loaded > 1 else 1
        logger.warning(
            "Only %d stations loaded but min_stations=%d requested; "
            "using effective_min=%d. Results with <%d stations are unreliable.",
            n_loaded, min_stations, effective_min, min_stations,
        )
    station_norm_daily = pd.DataFrame(norm_cols)
    # Global daily index
    n_valid_per_day = station_norm_daily.notna().sum(axis=1)
    global_daily = station_norm_daily.mean(axis=1)  # NaN-safe
    global_daily[n_valid_per_day < effective_min] = np.nan
    # Bin to bin_days
    cr_index = _bin_series(global_daily, study_start, bin_days)
    cr_index.name = "cr_index"
    n_valid_bins = int(cr_index.notna().sum())
    logger.info(
        "CR index: %d %d-day bins, %d non-NaN (%.1f%%)",
        len(cr_index), bin_days, n_valid_bins,
        100 * n_valid_bins / max(len(cr_index), 1),
    )
    return cr_index, station_norm_daily, effective_min
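
# Normalisation example: a station whose daily counts over the window are
# [100, 102, 98] has mean 100, so its normalised series is [1.00, 1.02, 0.98];
# the global index on each day is the mean of these dimensionless values
# across all stations with valid data that day.
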
# ---------------------------------------------------------------------------
# Step 2 — Seismic metric
# ---------------------------------------------------------------------------
def build_seismic_metric(
    study_start: str,
    study_end: str,
    bin_days: int,
    usgs_dir: Path,
    *,
    min_mag: float = MIN_MAG,
    cr_index: Optional[pd.Series] = None,
) -> pd.Series:
    """
    log10 of summed seismic-wave energy per *bin_days*-day bin, using
    E = 10^(1.5·Mw + 4.8) J per M ≥ *min_mag* event (Kanamori 1977).
    Aligned to *cr_index*'s index if supplied; otherwise built from scratch.
    Bins with no events are floored at the series minimum.
    """
    t0 = pd.Timestamp(study_start)
    t1 = pd.Timestamp(study_end)
    events = load_usgs(t0.year, t1.year, usgs_dir)
    if events.empty:
        raise RuntimeError("No USGS data — run scripts/01_download_data.py first.")
    events = events.loc[study_start:study_end]
    events = events[events["mag"] >= min_mag]
    logger.info(
        "Seismic: %d events M≥%.1f in %s – %s",
        len(events), min_mag, study_start, study_end,
    )
    # Physically correct: E = 10^(1.5·Mw + 4.8) J per event, summed per bin,
    # returned as log10(E_sum). See Kanamori (1977).
    seismic = seismic_energy_per_bin(
        events, study_start, study_end, bin_days, t0, min_mag=min_mag,
    )
    seismic = seismic.fillna(seismic.min())  # fill rare empty bins with floor
    # Align to CR index bins
    if cr_index is not None:
        seismic = seismic.reindex(cr_index.index, fill_value=seismic.min())
    nonnan_pct = 100 * seismic.notna().mean()
    logger.info(
        "Seismic metric (log10 E): %d bins, %.1f%% non-NaN, "
        "range [%.1f, %.1f] log10(J)",
        len(seismic), nonnan_pct, float(seismic.min()), float(seismic.max()),
    )
    return seismic
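
# Worked example for the energy metric: a single Mw 6.0 event contributes
# E = 10^(1.5*6.0 + 4.8) = 10^13.8 ≈ 6.3e13 J, so a bin containing only that
# event has metric value log10(ΣE) = 13.8. Two such events in one bin give
# log10(2 × 10^13.8) ≈ 14.1; energies are summed, not magnitudes.
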
# ---------------------------------------------------------------------------
# Step 3 — Cross-correlation
# ---------------------------------------------------------------------------
def pearson_lag_correlation(
    x: pd.Series,
    y: pd.Series,
    lag_bins: np.ndarray,
) -> pd.DataFrame:
    """
    Pearson r(τ) for each lag in *lag_bins* (bin units).

    Convention: τ > 0 means x leads y.
      At lag τ > 0: x[0..N-τ] vs y[τ..N]
      At lag τ < 0: x[|τ|..N] vs y[0..N-|τ|]

    Returns DataFrame: lag_bins, r, p_value, n_pairs.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    N = len(x_arr)
    rs, ps, ns = [], [], []
    for lag in lag_bins:
        if lag >= 0:
            end = N - lag
            if end <= 0:
                rs.append(np.nan); ps.append(np.nan); ns.append(0)
                continue
            xa = x_arr[:end]
            ya = y_arr[lag : lag + end]
        else:
            abslag = -lag
            end = N - abslag
            if end <= 0:
                rs.append(np.nan); ps.append(np.nan); ns.append(0)
                continue
            xa = x_arr[abslag : abslag + end]
            ya = y_arr[:end]
        valid = np.isfinite(xa) & np.isfinite(ya)
        n = int(valid.sum())
        ns.append(n)
        if n < 10:
            rs.append(np.nan); ps.append(np.nan)
            continue
        r, p = scipy.stats.pearsonr(xa[valid], ya[valid])
        rs.append(float(r)); ps.append(float(p))
    return pd.DataFrame({"lag_bins": lag_bins, "r": rs, "p_value": ps, "n_pairs": ns})
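
# Slicing example for the convention above: with N = 6 and lag = +2,
# xa = x[0:4] is paired with ya = y[2:6], i.e. each CR value is compared with
# the seismic value 2 bins (10 days) later, so positive τ tests "CR leads".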

def naive_sigma(p_value: float) -> float:
    """Two-tailed p → Gaussian sigma, z = Φ⁻¹(1 − p/2)."""
    if not np.isfinite(p_value) or p_value <= 0:
        return np.inf
    return float(scipy.stats.norm.isf(p_value / 2.0))
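
# Quick sanity check for naive_sigma: a two-tailed p of 0.0027 maps to
# roughly 3.0σ (scipy.stats.norm.isf(0.0027 / 2) ≈ 3.00), matching the usual
# two-tailed convention.
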
# ---------------------------------------------------------------------------
# Step 4 — Figure
# ---------------------------------------------------------------------------
def make_figure(
    lag_df: pd.DataFrame,
    bin_days: int,
    peak_lag_days: float,
    peak_r: float,
    n_stations: int,
    study_start: str,
    study_end: str,
    naive_p: float,
    sigma: float,
    cr_5d: pd.Series,
    seismic_5d: pd.Series,
    n_stations_eff: int,
    r_at_15: float,
    sigma_at_15: float,
) -> plt.Figure:
    lags_days = lag_df["lag_bins"].values * bin_days
    r_vals = lag_df["r"].values
    n0 = int(lag_df.loc[lag_df["lag_bins"] == 0, "n_pairs"].iloc[0])
    naive_ci95 = 1.96 / np.sqrt(n0) if n0 > 4 else 0.0

    # Naive r thresholds for σ=3 and σ=6 (two-tailed, N=n0)
    def _r_for_sigma(s: float, n: int) -> float:
        """r value corresponding to naive σ=s for sample size n."""
        p = 2.0 * scipy.stats.norm.sf(s)
        t_crit = scipy.stats.t.isf(p / 2.0, df=n - 2)
        return float(t_crit / np.sqrt(t_crit**2 + n - 2))

    r_sigma3 = _r_for_sigma(3.0, n0) if n0 > 4 else np.nan
    r_sigma6 = _r_for_sigma(6.0, n0) if n0 > 4 else np.nan
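
    # Rough magnitude check (for large n the t threshold approaches the
    # normal one): _r_for_sigma(s, n) ≈ s / sqrt(n), so for n = 3,000 bins
    # the 3σ line sits near r ≈ 3 / sqrt(3000) ≈ 0.055.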

    fig, axes = plt.subplots(
        3, 1, figsize=(14, 12),
        gridspec_kw={"height_ratios": [3, 1.4, 1.4]},
    )

    # ── Panel 1: r(τ) ──────────────────────────────────────────────────────
    ax = axes[0]
    ax.axhline(0, color="k", lw=0.8)
    ax.axhline( naive_ci95, color="C0", lw=0.8, ls="--",
               label=f"±{naive_ci95:.3f} naive 95 % CI (n={n0:,})")
    ax.axhline(-naive_ci95, color="C0", lw=0.8, ls="--")
    if np.isfinite(r_sigma3):
        ax.axhline( r_sigma3, color="C2", lw=0.9, ls=":", alpha=0.7,
                   label=f"Naive 3σ (r={r_sigma3:.3f})")
        ax.axhline(-r_sigma3, color="C2", lw=0.9, ls=":", alpha=0.7)
    if np.isfinite(r_sigma6):
        ax.axhline( r_sigma6, color="C1", lw=0.9, ls="-.", alpha=0.8,
                   label=f"Naive 6σ (r={r_sigma6:.3f}) ← Homola claim")
        ax.axhline(-r_sigma6, color="C1", lw=0.9, ls="-.", alpha=0.8)
    ax.axvline(0, color="k", lw=0.6, alpha=0.35)
    ax.plot(lags_days, r_vals, color="k", lw=1.1, zorder=3)

    # Mark the τ=+15 d bin (Homola claimed signal)
    lag15_bins = 15 // bin_days
    row15 = lag_df[lag_df["lag_bins"] == lag15_bins]
    if len(row15):
        r15 = float(row15["r"].iloc[0])
        if np.isfinite(r15):
            s15_str = f"{sigma_at_15:.1f}σ" if np.isfinite(sigma_at_15) and sigma_at_15 < 100 else ">8σ"
            ax.scatter([15], [r15], color="darkorange", s=60, zorder=6,
                       label=f"τ = +15 d: r = {r15:+.4f} ({s15_str} naive)")
            ax.axvline(15, color="darkorange", lw=1.0, ls=":", alpha=0.6)

    # Mark the overall peak
    ax.axvline(peak_lag_days, color="crimson", lw=1.3, ls="--",
               label=f"Peak τ = {peak_lag_days:+.0f} d, r = {peak_r:+.4f}")
    ax.scatter([peak_lag_days], [peak_r], color="crimson", s=50, zorder=5)

    r_clean = r_vals[np.isfinite(r_vals)]
    ylo = min(-0.15, r_clean.min() * 1.15) if len(r_clean) else -0.15
    yhi = max( 0.15, r_clean.max() * 1.15) if len(r_clean) else 0.15
    ax.set_xlim(lags_days[0], lags_days[-1])
    ax.set_ylim(ylo, yhi)
    ax.set_xlabel("Lag τ (days) [τ > 0: CR leads seismic]")
    ax.set_ylabel("Pearson r")
    ax.set_title(
        f"Homola et al. 2023 — replication baseline | "
        f"{n_stations} NMDB stations | "
        f"{study_start[:4]}–{study_end[:4]} | "
        f"5-day bins | M≥{MIN_MAG:.1f}",
        fontsize=10,
    )
    ax.legend(fontsize=8.5, loc="upper right", ncol=2)
    ax.grid(True, alpha=0.22)

    sig_str = f"{sigma:.1f}σ" if np.isfinite(sigma) and sigma < 100 else ">8σ"
    info = (
        f"Peak: r = {peak_r:+.5f} at τ = {peak_lag_days:+.0f} d ({sig_str} naive)\n"
        f"τ = +15 d: r = {r_at_15:+.5f} "
        + (f"({sigma_at_15:.1f}σ naive)" if np.isfinite(sigma_at_15) and sigma_at_15 < 100 else "")
        + f"\nn = {n0:,} valid bin-pairs at τ=0\n"
        f"⚠ Naive — ignores autocorr., solar cycle, lag-scan penalty"
    )
    ax.text(
        0.014, 0.975, info, transform=ax.transAxes,
        va="top", ha="left", fontsize=8.5,
        bbox=dict(boxstyle="round,pad=0.35", fc="lightyellow", alpha=0.92),
    )

    # ── Panel 2: CR index ──────────────────────────────────────────────────
    ax2 = axes[1]
    cr_dev = (cr_5d - 1.0) * 100  # % deviation from station-normalised mean
    ax2.plot(cr_dev.index, cr_dev.values, lw=0.7, color="navy", alpha=0.85)
    ax2.axhline(0, color="k", lw=0.5)
    ax2.set_ylabel("CR deviation (%)")
    ax2.set_title(
        f"Global CR index ({n_stations} stations, each normalised to its long-term mean)",
        fontsize=9,
    )
    ax2.grid(True, alpha=0.18)
    ax2.set_xlim(cr_5d.index[0], cr_5d.index[-1])

    # ── Panel 3: seismic metric ────────────────────────────────────────────
    ax3 = axes[2]
    ax3.fill_between(seismic_5d.index, seismic_5d.values, color="firebrick", alpha=0.6, lw=0)
    ax3.set_ylabel(f"log₁₀(Σ E) [log₁₀ J] (M≥{MIN_MAG:.1f})")
    ax3.set_title(
        "Global seismic metric (log₁₀ of summed energy per 5-day bin, "
        "E=10^(1.5·Mw+4.8) J)",
        fontsize=9,
    )
    ax3.grid(True, alpha=0.18)
    ax3.set_xlim(seismic_5d.index[0], seismic_5d.index[-1])

    fig.tight_layout(pad=1.2)
    return fig
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    args = _parse_args()
    np.random.seed(args.seed)
    git_sha = _get_git_sha()
    logger.info("Homola replication | git=%s seed=%d", git_sha, args.seed)
    logger.info("Study window: %s – %s", args.study_start, args.study_end)
    logger.info("Bin: %d days | Lags: %+d – %+d days",
                args.bin_days, args.lag_min, args.lag_max)

    args.output_dir.mkdir(parents=True, exist_ok=True)
    figs_dir = args.output_dir / "figs"
    figs_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. CR index
    station_names = _load_station_names(args.config)
    cr_5d, station_norm_daily, effective_min = build_cr_index(
        station_names,
        args.study_start, args.study_end,
        args.bin_days, args.nmdb_dir,
        coverage_threshold=NMDB_COVERAGE_THRESHOLD,
        min_stations=args.min_stations,
    )
    n_stations = station_norm_daily.shape[1]

    # ── 2. Seismic metric
    seismic_5d = build_seismic_metric(
        args.study_start, args.study_end,
        args.bin_days, args.usgs_dir,
        min_mag=args.min_mag,
        cr_index=cr_5d,
    )

    # ── 3. Data quality assessment
    n_cr_bins = int(cr_5d.notna().sum())
    study_days = (pd.Timestamp(args.study_end) - pd.Timestamp(args.study_start)).days
    expect_bins = study_days // args.bin_days
    coverage_pct = 100.0 * n_cr_bins / max(expect_bins, 1)
    if coverage_pct < 50:
        logger.warning(
            "⚠ CR data coverage = %.1f%% of study window. "
            "Results are not meaningful with less than half the data. "
            "Run: python scripts/01_download_data.py (full run, no --subset).",
            coverage_pct,
        )

    # ── 4. Cross-correlation
    lag_bins = np.arange(
        args.lag_min // args.bin_days,
        args.lag_max // args.bin_days + 1,
        dtype=int,
    )
    logger.info("Computing Pearson r(τ) for %d lags …", len(lag_bins))
    lag_df = pearson_lag_correlation(cr_5d, seismic_5d, lag_bins)

    # ── 5. Peak statistics
    valid_mask = lag_df["r"].notna()
    if not valid_mask.any():
        raise RuntimeError(
            "All correlation values are NaN — data is insufficient for analysis."
        )
    peak_idx = lag_df.loc[valid_mask, "r"].abs().idxmax()
    peak_row = lag_df.loc[peak_idx]
    peak_lag_bins = int(peak_row["lag_bins"])
    peak_lag_days = peak_lag_bins * args.bin_days
    peak_r = float(peak_row["r"])
    naive_p = float(peak_row["p_value"])
    n_at_peak = int(peak_row["n_pairs"])
    sigma = naive_sigma(naive_p)

    # Correlation specifically at the Homola +15-day claim
    lag15_bins = 15 // args.bin_days  # = 3 bins
    row15 = lag_df[lag_df["lag_bins"] == lag15_bins]
    r_at_15 = float(row15["r"].iloc[0]) if len(row15) else np.nan
    p_at_15 = float(row15["p_value"].iloc[0]) if len(row15) else np.nan
    sigma_at_15 = naive_sigma(p_at_15) if np.isfinite(p_at_15) else np.nan
logger.info("" * 55)
logger.info("Peak r = %+.5f at τ = %+d days (%+d bins)",
peak_r, peak_lag_days, peak_lag_bins)
logger.info("Naive p = %.3e (%.1fσ)", naive_p,
sigma if np.isfinite(sigma) else 999)
logger.info("n pairs at peak = %d", n_at_peak)
logger.info("r at τ = +15 d = %+.5f (naive p = %.2e, %.1fσ)",
r_at_15, p_at_15 if np.isfinite(p_at_15) else 1,
sigma_at_15 if np.isfinite(sigma_at_15) else 0)
logger.info("" * 55)
logger.info(
"NOTE: naive significance ignores autocorrelation, the shared solar-"
"cycle trend, and scanning over %d lags — expect gross over-significance.",
len(lag_bins),
)
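
    # Rough scale of the lag-scan penalty flagged above: at the default lag
    # range (τ = -1000 … +1000 d in 5-day steps) 401 lags are tested, so a
    # Bonferroni-style 5% family-wise threshold would already require
    # p < 0.05 / 401 ≈ 1.2e-4 at any single lag, before accounting for
    # autocorrelation at all.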

    # ── 6. Figure
    fig = make_figure(
        lag_df, args.bin_days, peak_lag_days, peak_r,
        n_stations, args.study_start, args.study_end,
        naive_p, sigma, cr_5d, seismic_5d,
        n_stations_eff=effective_min,
        r_at_15=r_at_15,
        sigma_at_15=sigma_at_15,
    )
    fig_path = figs_dir / "homola_replication.png"
    fig.savefig(fig_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    logger.info("Saved: %s", fig_path)

    # ── 7. JSON log
    results = {
        "script": "scripts/02_homola_replication.py",
        "git_sha": git_sha,
        "seed": args.seed,
        "timestamp_utc": datetime.now(timezone.utc).isoformat(),
        "study_window": [args.study_start, args.study_end],
        "bin_days": args.bin_days,
        "lag_range_days": [args.lag_min, args.lag_max],
        "min_magnitude": args.min_mag,
        "nmdb_coverage_threshold": NMDB_COVERAGE_THRESHOLD,
        "min_stations_requested": args.min_stations,
        "min_stations_effective": effective_min,
        "n_stations_used": n_stations,
        "n_cr_bins_total": int(len(cr_5d)),
        "n_cr_bins_valid": n_cr_bins,
        "cr_data_coverage_pct": round(coverage_pct, 2),
        "peak_lag_days": peak_lag_days,
        "peak_lag_bins": peak_lag_bins,
        "peak_r": round(peak_r, 6),
        "naive_p_value": float(f"{naive_p:.6e}"),
        "naive_sigma": round(sigma, 3) if np.isfinite(sigma) and sigma < 1e6 else None,
        "n_pairs_at_peak": n_at_peak,
        "r_at_tau_15d": round(r_at_15, 6) if np.isfinite(r_at_15) else None,
        "p_at_tau_15d": float(f"{p_at_15:.6e}") if np.isfinite(p_at_15) else None,
        "sigma_at_tau_15d": round(sigma_at_15, 3) if np.isfinite(sigma_at_15) else None,
        "n_lags_scanned": len(lag_bins),
        "note_significance": (
            "Naive p from Pearson t-test treating 5-day bins as i.i.d. "
            "Not corrected for: (1) temporal autocorrelation, "
            "(2) shared solar-cycle trend, (3) scan over multiple lags. "
            "Expected to be grossly over-significant — corrected tests in phase 2."
        ),
    }
    json_path = args.output_dir / "homola_replication.json"
    json_path.write_text(json.dumps(results, indent=2))
    logger.info("Saved: %s", json_path)

    # ── Human summary
    print()
    print("=" * 62)
    print("  HOMOLA ET AL. 2023 — REPLICATION SUMMARY")
    print("=" * 62)
    print(f"  Stations used          : {n_stations}")
    print(f"  CR data coverage       : {coverage_pct:.1f}% of study window")
    print(f"  Valid 5-day bins (CR)  : {n_cr_bins:,}")
    print()
    print(f"  Peak |r|               : {abs(peak_r):.5f} (r = {peak_r:+.5f})")
    print(f"  Peak lag τ             : {peak_lag_days:+d} days")
    print(f"  Naive p-value at peak  : {naive_p:.3e}")
    sstr = f"{sigma:.2f}σ" if np.isfinite(sigma) and sigma < 1e6 else ">8σ"
    print(f"  Naive sigma at peak    : {sstr}")
    print()
    s15str = f"{sigma_at_15:.2f}σ" if np.isfinite(sigma_at_15) else "N/A"
    print(f"  r at τ = +15 d (claim) : {r_at_15:+.5f} ({s15str})")
    print()
    if coverage_pct < 50:
        print("  ⚠ WARNING: data coverage < 50% — results not meaningful.")
        print("    python scripts/01_download_data.py (full run)")
    else:
        print("  ✓ Data coverage sufficient.")
    print("=" * 62)


if __name__ == "__main__":
    main()