#!/usr/bin/env python3 """ scripts/catalog_completeness_check.py ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Plot and document USGS catalogue completeness as a function of magnitude threshold and time. Outputs ------- results/catalog_completeness.png — event count per year at 6 thresholds results/catalog_completeness.txt — text summary of effective start years Usage ----- python scripts/catalog_completeness_check.py python scripts/catalog_completeness_check.py --data-dir /path/to/data/raw/usgs """ from __future__ import annotations import argparse import sys from pathlib import Path import matplotlib matplotlib.use("Agg") # headless — no display required import matplotlib.pyplot as plt import numpy as np import pandas as pd PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) from crq.ingest.usgs import load_usgs # Magnitude thresholds to test THRESHOLDS = [4.0, 4.5, 5.0, 5.5, 6.0, 7.0] COLORS = ["#e41a1c", "#ff7f00", "#4daf4a", "#377eb8", "#984ea3", "#a65628"] def _parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--data-dir", type=Path, default=PROJECT_ROOT / "data" / "raw" / "usgs") p.add_argument("--output-dir", type=Path, default=PROJECT_ROOT / "results") p.add_argument("--start-year", type=int, default=1960) p.add_argument("--end-year", type=int, default=2019) return p.parse_args() def _load_events(data_dir: Path, start_year: int, end_year: int) -> pd.DataFrame: return load_usgs(start_year, end_year, data_dir) def _annual_counts(events: pd.DataFrame, threshold: float) -> pd.Series: """Events per year at or above *threshold*.""" filtered = events[events["mag"] >= threshold] return filtered.groupby(filtered.index.year).size() def _stability_year(annual: pd.Series, window: int = 5, cv_max: float = 0.15) -> int | None: """ Estimate the first year after which the annual event count stabilises. Uses a sliding window coefficient of variation (std/mean). Returns the first year where the CV of the subsequent *window*-year block falls below *cv_max*, or None if the series never stabilises. """ years = annual.index.tolist() values = annual.values.astype(float) for i in range(len(years) - window): block = values[i : i + window] if np.mean(block) > 0 and np.std(block) / np.mean(block) < cv_max: return years[i] return None def _gutenberg_richter(events: pd.DataFrame, mag_min: float = 4.5, mag_max: float = 9.5) -> tuple[float, float]: """ Fit Gutenberg-Richter b-value using maximum likelihood (Aki 1965). Returns (b, a) for log10(N) = a - b·M. """ mags = events["mag"].dropna() mags = mags[(mags >= mag_min) & (mags <= mag_max)] if len(mags) < 10: return np.nan, np.nan b = np.log10(np.e) / (mags.mean() - mag_min) a = np.log10(len(mags)) + b * mag_min return float(b), float(a) def run(args: argparse.Namespace) -> None: args.output_dir.mkdir(parents=True, exist_ok=True) print(f"Loading USGS events from {args.data_dir} …") events = _load_events(args.data_dir, args.start_year, args.end_year) if events.empty: print("ERROR: no events loaded — run 01_download_data.py first") sys.exit(1) print(f" {len(events):,} events loaded, spanning " f"{events.index.min().year}–{events.index.max().year}") # ---------------------------------------------------------------- # Figure 1: annual event counts per magnitude threshold # ---------------------------------------------------------------- fig, axes = plt.subplots(2, 1, figsize=(14, 10), gridspec_kw={"height_ratios": [3, 1]}) ax_count, ax_ratio = axes year_range = np.arange(args.start_year, args.end_year + 1) summary_lines: list[str] = [] annual_all: dict[float, pd.Series] = {} for thresh, color in zip(THRESHOLDS, COLORS): annual = _annual_counts(events, thresh).reindex(year_range, fill_value=0) annual_all[thresh] = annual ax_count.plot(annual.index, annual.values, color=color, label=f"M ≥ {thresh:.1f}", linewidth=1.5 if thresh != 4.5 else 2.5) stab = _stability_year(annual) marker = f"stable from ~{stab}" if stab else "no clear stabilisation" summary_lines.append(f"M ≥ {thresh:.1f}: {marker}") ax_count.set_ylabel("Events per year") ax_count.set_title("USGS Earthquake Catalogue — Annual Event Counts by Magnitude Threshold") ax_count.legend(loc="upper left") ax_count.grid(True, alpha=0.3) ax_count.set_xlim(args.start_year, args.end_year) # Panel 2: ratio M≥4.5 / M≥5.0 — should be roughly constant when both complete r45 = annual_all[4.5] r50 = annual_all[5.0] ratio = (r45 / r50.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan) ax_ratio.plot(ratio.index, ratio.values, color="black", linewidth=1.0) ax_ratio.axhline(ratio.loc[1990:2019].median(), color="red", linestyle="--", label=f"Median (1990-2019): {ratio.loc[1990:2019].median():.1f}") ax_ratio.set_ylabel("Ratio M≥4.5 / M≥5.0") ax_ratio.set_xlabel("Year") ax_ratio.legend() ax_ratio.grid(True, alpha=0.3) ax_ratio.set_xlim(args.start_year, args.end_year) fig.tight_layout() plot_path = args.output_dir / "catalog_completeness.png" fig.savefig(plot_path, dpi=150, bbox_inches="tight") plt.close(fig) print(f"Saved: {plot_path}") # ---------------------------------------------------------------- # Figure 2: Gutenberg-Richter frequency-magnitude diagram # ---------------------------------------------------------------- fig2, ax2 = plt.subplots(figsize=(8, 6)) mag_bins = np.arange(4.0, 9.5, 0.2) counts_ge = [(events["mag"] >= m).sum() for m in mag_bins] ax2.semilogy(mag_bins, counts_ge, "ko", markersize=4, label="Observed N(≥M)") b, a = _gutenberg_richter(events) if not np.isnan(b): m_fit = np.linspace(4.5, 9.0, 100) n_fit = 10 ** (a - b * m_fit) ax2.semilogy(m_fit, n_fit, "r-", label=f"G-R fit: b={b:.2f}") ax2.set_xlabel("Magnitude M") ax2.set_ylabel("Cumulative count N(≥M)") ax2.set_title("Gutenberg-Richter Frequency-Magnitude Distribution") ax2.legend() ax2.grid(True, which="both", alpha=0.3) gr_path = args.output_dir / "gutenberg_richter.png" fig2.savefig(gr_path, dpi=150, bbox_inches="tight") plt.close(fig2) print(f"Saved: {gr_path}") # ---------------------------------------------------------------- # Text summary # ---------------------------------------------------------------- txt_path = args.output_dir / "catalog_completeness.txt" lines = [ "USGS Earthquake Catalogue — Completeness Summary", "=" * 50, f"Total events loaded: {len(events):,}", f"Date range: {events.index.min().date()} — {events.index.max().date()}", f"Gutenberg-Richter b-value (M≥4.5): {b:.3f}" if not np.isnan(b) else "G-R fit: insufficient data", "", "Estimated catalogue completeness onset (CV < 15% over 5-year window):", ] + [f" {s}" for s in summary_lines] + [ "", "NOTE: Use M≥4.5 from ~1976 onwards for global cross-correlation analysis.", "Earlier data should be treated with caution — counts are systematically low.", ] txt_path.write_text("\n".join(lines)) print(f"Saved: {txt_path}") print() for line in lines: print(line) if __name__ == "__main__": run(_parse_args())