198 lines
7.6 KiB
Python
198 lines
7.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
scripts/catalog_completeness_check.py
|
|||
|
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|||
|
|
Plot and document USGS catalogue completeness as a function of magnitude
|
|||
|
|
threshold and time.
|
|||
|
|
|
|||
|
|
Outputs
|
|||
|
|
-------
|
|||
|
|
results/catalog_completeness.png — event count per year at 6 thresholds
|
|||
|
|
results/catalog_completeness.txt — text summary of effective start years
|
|||
|
|
|
|||
|
|
Usage
|
|||
|
|
-----
|
|||
|
|
python scripts/catalog_completeness_check.py
|
|||
|
|
python scripts/catalog_completeness_check.py --data-dir /path/to/data/raw/usgs
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
import matplotlib
|
|||
|
|
matplotlib.use("Agg") # headless — no display required
|
|||
|
|
import matplotlib.pyplot as plt
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
|
|||
|
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|||
|
|
sys.path.insert(0, str(PROJECT_ROOT / "src"))
|
|||
|
|
|
|||
|
|
from crq.ingest.usgs import load_usgs
|
|||
|
|
|
|||
|
|
# Magnitude thresholds to test
|
|||
|
|
THRESHOLDS = [4.0, 4.5, 5.0, 5.5, 6.0, 7.0]
|
|||
|
|
COLORS = ["#e41a1c", "#ff7f00", "#4daf4a", "#377eb8", "#984ea3", "#a65628"]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _parse_args() -> argparse.Namespace:
|
|||
|
|
p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
|
|||
|
|
p.add_argument("--data-dir", type=Path, default=PROJECT_ROOT / "data" / "raw" / "usgs")
|
|||
|
|
p.add_argument("--output-dir", type=Path, default=PROJECT_ROOT / "results")
|
|||
|
|
p.add_argument("--start-year", type=int, default=1960)
|
|||
|
|
p.add_argument("--end-year", type=int, default=2019)
|
|||
|
|
return p.parse_args()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _load_events(data_dir: Path, start_year: int, end_year: int) -> pd.DataFrame:
|
|||
|
|
return load_usgs(start_year, end_year, data_dir)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _annual_counts(events: pd.DataFrame, threshold: float) -> pd.Series:
|
|||
|
|
"""Events per year at or above *threshold*."""
|
|||
|
|
filtered = events[events["mag"] >= threshold]
|
|||
|
|
return filtered.groupby(filtered.index.year).size()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _stability_year(annual: pd.Series, window: int = 5, cv_max: float = 0.15) -> int | None:
|
|||
|
|
"""
|
|||
|
|
Estimate the first year after which the annual event count stabilises.
|
|||
|
|
|
|||
|
|
Uses a sliding window coefficient of variation (std/mean). Returns the
|
|||
|
|
first year where the CV of the subsequent *window*-year block falls below
|
|||
|
|
*cv_max*, or None if the series never stabilises.
|
|||
|
|
"""
|
|||
|
|
years = annual.index.tolist()
|
|||
|
|
values = annual.values.astype(float)
|
|||
|
|
for i in range(len(years) - window):
|
|||
|
|
block = values[i : i + window]
|
|||
|
|
if np.mean(block) > 0 and np.std(block) / np.mean(block) < cv_max:
|
|||
|
|
return years[i]
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _gutenberg_richter(events: pd.DataFrame, mag_min: float = 4.5, mag_max: float = 9.5) -> tuple[float, float]:
|
|||
|
|
"""
|
|||
|
|
Fit Gutenberg-Richter b-value using maximum likelihood (Aki 1965).
|
|||
|
|
|
|||
|
|
Returns (b, a) for log10(N) = a - b·M.
|
|||
|
|
"""
|
|||
|
|
mags = events["mag"].dropna()
|
|||
|
|
mags = mags[(mags >= mag_min) & (mags <= mag_max)]
|
|||
|
|
if len(mags) < 10:
|
|||
|
|
return np.nan, np.nan
|
|||
|
|
b = np.log10(np.e) / (mags.mean() - mag_min)
|
|||
|
|
a = np.log10(len(mags)) + b * mag_min
|
|||
|
|
return float(b), float(a)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run(args: argparse.Namespace) -> None:
|
|||
|
|
args.output_dir.mkdir(parents=True, exist_ok=True)
|
|||
|
|
|
|||
|
|
print(f"Loading USGS events from {args.data_dir} …")
|
|||
|
|
events = _load_events(args.data_dir, args.start_year, args.end_year)
|
|||
|
|
if events.empty:
|
|||
|
|
print("ERROR: no events loaded — run 01_download_data.py first")
|
|||
|
|
sys.exit(1)
|
|||
|
|
print(f" {len(events):,} events loaded, spanning "
|
|||
|
|
f"{events.index.min().year}–{events.index.max().year}")
|
|||
|
|
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
# Figure 1: annual event counts per magnitude threshold
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
fig, axes = plt.subplots(2, 1, figsize=(14, 10),
|
|||
|
|
gridspec_kw={"height_ratios": [3, 1]})
|
|||
|
|
ax_count, ax_ratio = axes
|
|||
|
|
|
|||
|
|
year_range = np.arange(args.start_year, args.end_year + 1)
|
|||
|
|
summary_lines: list[str] = []
|
|||
|
|
annual_all: dict[float, pd.Series] = {}
|
|||
|
|
|
|||
|
|
for thresh, color in zip(THRESHOLDS, COLORS):
|
|||
|
|
annual = _annual_counts(events, thresh).reindex(year_range, fill_value=0)
|
|||
|
|
annual_all[thresh] = annual
|
|||
|
|
ax_count.plot(annual.index, annual.values, color=color,
|
|||
|
|
label=f"M ≥ {thresh:.1f}", linewidth=1.5 if thresh != 4.5 else 2.5)
|
|||
|
|
stab = _stability_year(annual)
|
|||
|
|
marker = f"stable from ~{stab}" if stab else "no clear stabilisation"
|
|||
|
|
summary_lines.append(f"M ≥ {thresh:.1f}: {marker}")
|
|||
|
|
|
|||
|
|
ax_count.set_ylabel("Events per year")
|
|||
|
|
ax_count.set_title("USGS Earthquake Catalogue — Annual Event Counts by Magnitude Threshold")
|
|||
|
|
ax_count.legend(loc="upper left")
|
|||
|
|
ax_count.grid(True, alpha=0.3)
|
|||
|
|
ax_count.set_xlim(args.start_year, args.end_year)
|
|||
|
|
|
|||
|
|
# Panel 2: ratio M≥4.5 / M≥5.0 — should be roughly constant when both complete
|
|||
|
|
r45 = annual_all[4.5]
|
|||
|
|
r50 = annual_all[5.0]
|
|||
|
|
ratio = (r45 / r50.replace(0, np.nan)).replace([np.inf, -np.inf], np.nan)
|
|||
|
|
ax_ratio.plot(ratio.index, ratio.values, color="black", linewidth=1.0)
|
|||
|
|
ax_ratio.axhline(ratio.loc[1990:2019].median(), color="red", linestyle="--",
|
|||
|
|
label=f"Median (1990-2019): {ratio.loc[1990:2019].median():.1f}")
|
|||
|
|
ax_ratio.set_ylabel("Ratio M≥4.5 / M≥5.0")
|
|||
|
|
ax_ratio.set_xlabel("Year")
|
|||
|
|
ax_ratio.legend()
|
|||
|
|
ax_ratio.grid(True, alpha=0.3)
|
|||
|
|
ax_ratio.set_xlim(args.start_year, args.end_year)
|
|||
|
|
|
|||
|
|
fig.tight_layout()
|
|||
|
|
plot_path = args.output_dir / "catalog_completeness.png"
|
|||
|
|
fig.savefig(plot_path, dpi=150, bbox_inches="tight")
|
|||
|
|
plt.close(fig)
|
|||
|
|
print(f"Saved: {plot_path}")
|
|||
|
|
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
# Figure 2: Gutenberg-Richter frequency-magnitude diagram
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
fig2, ax2 = plt.subplots(figsize=(8, 6))
|
|||
|
|
mag_bins = np.arange(4.0, 9.5, 0.2)
|
|||
|
|
counts_ge = [(events["mag"] >= m).sum() for m in mag_bins]
|
|||
|
|
ax2.semilogy(mag_bins, counts_ge, "ko", markersize=4, label="Observed N(≥M)")
|
|||
|
|
|
|||
|
|
b, a = _gutenberg_richter(events)
|
|||
|
|
if not np.isnan(b):
|
|||
|
|
m_fit = np.linspace(4.5, 9.0, 100)
|
|||
|
|
n_fit = 10 ** (a - b * m_fit)
|
|||
|
|
ax2.semilogy(m_fit, n_fit, "r-", label=f"G-R fit: b={b:.2f}")
|
|||
|
|
|
|||
|
|
ax2.set_xlabel("Magnitude M")
|
|||
|
|
ax2.set_ylabel("Cumulative count N(≥M)")
|
|||
|
|
ax2.set_title("Gutenberg-Richter Frequency-Magnitude Distribution")
|
|||
|
|
ax2.legend()
|
|||
|
|
ax2.grid(True, which="both", alpha=0.3)
|
|||
|
|
gr_path = args.output_dir / "gutenberg_richter.png"
|
|||
|
|
fig2.savefig(gr_path, dpi=150, bbox_inches="tight")
|
|||
|
|
plt.close(fig2)
|
|||
|
|
print(f"Saved: {gr_path}")
|
|||
|
|
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
# Text summary
|
|||
|
|
# ----------------------------------------------------------------
|
|||
|
|
txt_path = args.output_dir / "catalog_completeness.txt"
|
|||
|
|
lines = [
|
|||
|
|
"USGS Earthquake Catalogue — Completeness Summary",
|
|||
|
|
"=" * 50,
|
|||
|
|
f"Total events loaded: {len(events):,}",
|
|||
|
|
f"Date range: {events.index.min().date()} — {events.index.max().date()}",
|
|||
|
|
f"Gutenberg-Richter b-value (M≥4.5): {b:.3f}" if not np.isnan(b) else "G-R fit: insufficient data",
|
|||
|
|
"",
|
|||
|
|
"Estimated catalogue completeness onset (CV < 15% over 5-year window):",
|
|||
|
|
] + [f" {s}" for s in summary_lines] + [
|
|||
|
|
"",
|
|||
|
|
"NOTE: Use M≥4.5 from ~1976 onwards for global cross-correlation analysis.",
|
|||
|
|
"Earlier data should be treated with caution — counts are systematically low.",
|
|||
|
|
]
|
|||
|
|
txt_path.write_text("\n".join(lines))
|
|||
|
|
print(f"Saved: {txt_path}")
|
|||
|
|
print()
|
|||
|
|
for line in lines:
|
|||
|
|
print(line)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
run(_parse_args())
|