cosmicraysandearthquakes/scripts/06_check_data_availability.py
root e5a812fa14 Initial commit: full analysis pipeline source code
Scripts 01-08 implement the complete cosmic-ray/earthquake correlation
analysis from data ingestion through out-of-sample validation and
combined timeseries sinusoid fitting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 02:45:10 +02:00

455 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
scripts/06_check_data_availability.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Determine the most recent date on which all three data sources are reliably
available, then download missing data for the 2020-to-present window.
Data sources
------------
1. NMDB — hourly pressure-corrected neutron monitor counts.
Reliable end = last date with ≥ 60% hourly coverage, minus 30 days
to allow for processing delays. Flags stations with > 30-day gaps.
2. USGS — M ≥ 4.5 global earthquake catalogue via FDSN.
Catalogue is generally complete within ~30 days.
Reliable end = today - 45 days.
3. SIDC — SILSO daily sunspot numbers.
Definitive values: ~6-month lag. Provisional: ~30-day lag.
Reliable end (definitive) = today - 180 days.
This script uses the provisional series with a note, so
reliable end = today - 30 days.
Common window end = min(NMDB_reliable, USGS_reliable, SIDC_reliable).
Window start is fixed at 2020-01-01 (first date post-Homola study period).
Outputs
-------
results/data_availability.json — window dates + per-source details
results/data_availability.txt — human-readable report
Usage
-----
python scripts/06_check_data_availability.py
python scripts/06_check_data_availability.py --no-download # check only
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
from datetime import date, timedelta
from pathlib import Path
import numpy as np
import pandas as pd
import requests
import yaml
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
from crq.ingest.nmdb import load_station, resample_daily, download_station_year
from crq.ingest.usgs import download_year as usgs_download_year, load_usgs
# Console logging: timestamped, level-tagged records.
# Fix: the original format string ran %(name)s and %(message)s together
# with no separator ("crq.availJSON saved: ..."); add a space between them.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
logger = logging.getLogger("crq.avail")

# Start of the out-of-sample window (first date after the in-sample study period).
OOS_START = "2020-01-01"
# A daily bin is valid only when >= 60% of its hourly samples are present.
COVERAGE_THRESHOLD = 0.60
MIN_COVERAGE_FRACTION = 0.50  # station needs 50% valid bins in OOS window
GAP_WARN_DAYS = 30  # flag stations whose newest data is older than this
# ---------------------------------------------------------------------------
# SIDC download
# ---------------------------------------------------------------------------
# SILSO daily total sunspot number (V2.0) — primary CSV export endpoint.
_SIDC_URL = (
    "https://www.sidc.be/silso/INFO/sndhcsv.php"
)
# Static copy of the full V2.0 daily series; tried when the primary URL fails.
_SIDC_URL_ALT = (
    "https://www.sidc.be/silso/DATA/SN_d_tot_V2.0.csv"
)
def download_sidc(sidc_dir: Path, timeout: int = 60) -> Path | None:
    """Fetch the SIDC/SILSO daily total sunspot number (V2.0) CSV.

    Tries the primary endpoint first, then the alternate.  When both
    downloads fail, falls back to a previously cached non-empty file.

    Parameters
    ----------
    sidc_dir : destination directory (created if absent)
    timeout  : per-request timeout in seconds

    Returns the path to ``sunspots.csv`` on success (fresh or cached),
    or None when nothing usable is available.
    """
    sidc_dir.mkdir(parents=True, exist_ok=True)
    dest = sidc_dir / "sunspots.csv"
    for url in (_SIDC_URL, _SIDC_URL_ALT):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            dest.write_text(response.text, encoding="utf-8")
        except Exception as exc:
            logger.warning("SIDC download failed (%s): %s", url, exc)
            continue
        logger.info("SIDC downloaded: %d bytes → %s", len(response.text), dest)
        return dest
    # Both URLs failed — reuse a cached file if one exists and is non-empty.
    cached = dest.exists() and dest.stat().st_size > 0
    if cached:
        logger.warning("SIDC download failed — using cached file %s", dest)
        return dest
    return None
def check_sidc(sidc_dir: Path, today: date) -> dict:
    """Parse the cached SIDC sunspot CSV and determine its reliable end date.

    Parameters
    ----------
    sidc_dir : directory expected to contain ``sunspots.csv``
    today    : reference date for the reliability cutoff

    Returns a dict whose ``status`` is one of ``missing``, ``no_records``,
    ``parse_error`` or ``ok``; on success it also carries ``last_date``,
    ``definitive_end``, ``reliable_end``, ``n_records`` and a ``note``.
    ``reliable_end`` is None for every non-ok status.
    """
    path = sidc_dir / "sunspots.csv"
    if not path.exists():
        return {"status": "missing", "last_date": None, "reliable_end": None}
    try:
        # SIDC V2.0 CSV: year;month;day;fracyear;SSN;std;Nobs;provisional
        df = pd.read_csv(
            path,
            sep=";",
            header=None,
            names=["year", "month", "day", "fracyear", "ssn", "std", "nobs", "prov"],
            comment="#",
            dtype=str,
        )
        # Keep only rows whose year field is numeric (drops stray headers).
        df = df[df["year"].str.isnumeric()]
        df["date"] = pd.to_datetime(
            df["year"].str.strip() + "-" + df["month"].str.strip() + "-" + df["day"].str.strip(),
            errors="coerce",
        )
        df = df.dropna(subset=["date"])
        if df.empty:
            # Robustness fix: previously an empty frame produced NaT for
            # last_date, leaking "NaT" strings (or a parse_error) downstream.
            return {"status": "no_records", "last_date": None, "reliable_end": None}
        df["ssn"] = pd.to_numeric(df["ssn"].str.strip(), errors="coerce")
        df["prov"] = pd.to_numeric(df["prov"].str.strip(), errors="coerce").fillna(1).astype(int)
        last_date = df["date"].max().date()
        # Provisional records (prov=1) may be revised; definitive = prov=0
        definitive = df[df["prov"] == 0]["date"].max().date() if (df["prov"] == 0).any() else None
        # Reliable end: today minus 30 days (provisional is good enough),
        # capped by the newest record actually present in the file.
        reliable_end = today - timedelta(days=30)
        return {
            "status": "ok",
            "last_date": str(last_date),
            "definitive_end": str(definitive) if definitive else None,
            "reliable_end": str(min(reliable_end, last_date)),
            "n_records": len(df),
            "note": "Using provisional values (prov=1); definitive lag ~6 months",
        }
    except Exception as exc:
        logger.warning("SIDC parse error: %s", exc)
        return {"status": "parse_error", "error": str(exc), "reliable_end": None}
# ---------------------------------------------------------------------------
# NMDB download + check
# ---------------------------------------------------------------------------
def download_nmdb_oos(
    station_ids: list[str],
    nmdb_dir: Path,
    oos_start_year: int,
    oos_end_year: int,
    sleep_between: float = 0.3,
) -> dict[str, list[int]]:
    """
    Fetch every missing NMDB station-year file in the inclusive range
    [oos_start_year, oos_end_year].

    Station-years already cached as non-empty files are skipped; download
    failures are logged and do not abort the loop.  A short pause of
    *sleep_between* seconds follows each successful fetch to be polite to
    the NMDB server.  Returns station_id -> list of years newly downloaded.
    """
    fetched: dict[str, list[int]] = {sid: [] for sid in station_ids}
    years = range(oos_start_year, oos_end_year + 1)
    for sid in station_ids:
        for yr in years:
            target = nmdb_dir / f"{sid}{yr}.csv"
            if target.exists() and target.stat().st_size > 0:
                logger.debug("skip %s %d (exists)", sid, yr)
                continue
            try:
                download_station_year(sid, yr, nmdb_dir)
            except Exception as exc:
                logger.warning("NMDB %s %d: %s", sid, yr, exc)
            else:
                fetched[sid].append(yr)
                time.sleep(sleep_between)
    return fetched
def check_nmdb_stations(
    station_ids: list[str],
    nmdb_dir: Path,
    oos_start: str,
    today: date,
) -> dict[str, dict]:
    """
    For each station, determine its coverage fraction in the OOS window and
    the most recent date with valid daily data.

    Parameters
    ----------
    station_ids : NMDB station codes
    nmdb_dir    : directory with cached hourly station-year CSV files
    oos_start   : ISO date string, start of the out-of-sample window
    today       : reference date (upper bound of the window)

    Returns station_id -> info dict with keys ``status``, ``coverage_oos``,
    ``last_date``, ``gap_days`` and — when any OOS data exist — ``flag_gap``
    (True when the newest data is more than GAP_WARN_DAYS old).
    """
    oos_start_ts = pd.Timestamp(oos_start)
    oos_end_ts = pd.Timestamp(today.isoformat())
    start_year = int(oos_start[:4])
    end_year = today.year
    station_info: dict[str, dict] = {}
    for station in station_ids:
        hourly = load_station(station, start_year, end_year, nmdb_dir)
        if hourly.empty:
            station_info[station] = {
                "status": "no_data",
                "coverage_oos": 0.0,
                "last_date": None,
                "gap_days": None,
            }
            continue
        hourly_oos = hourly.loc[oos_start:]
        if hourly_oos.empty:
            station_info[station] = {
                "status": "no_oos_data",
                "coverage_oos": 0.0,
                "last_date": None,
                "gap_days": None,
            }
            continue
        daily_df = resample_daily(hourly_oos, station, coverage_threshold=COVERAGE_THRESHOLD)
        daily = daily_df[station]
        # Denominator spans the full calendar window up to *today*, so recent
        # processing lag slightly lowers apparent coverage (intentional).
        n_total = (oos_end_ts - oos_start_ts).days + 1
        valid = daily.dropna()
        n_valid = int(valid.size)
        coverage = n_valid / n_total
        last_valid = valid.index.max().date() if not valid.empty else None
        gap_days = (today - last_valid).days if last_valid else None
        station_info[station] = {
            "status": "ok" if coverage >= MIN_COVERAGE_FRACTION else "low_coverage",
            "coverage_oos": round(coverage, 4),
            "last_date": str(last_valid) if last_valid else None,
            "gap_days": gap_days,
            "flag_gap": gap_days > GAP_WARN_DAYS if gap_days is not None else True,
        }
        logger.info(
            "NMDB %-6s coverage=%.1f%% last=%s gap=%s d",
            station, 100 * coverage,
            last_valid or "N/A",
            # Bug fix: `gap_days or "N/A"` printed "N/A" for a real 0-day gap.
            gap_days if gap_days is not None else "N/A",
        )
    return station_info
def nmdb_reliable_end(station_info: dict[str, dict], today: date) -> date:
    """
    NMDB reliable end: the 25th-percentile ``last_date`` among stations with
    good coverage, minus 30 days for processing delay.

    (Docstring corrected: the code uses the conservative 25th percentile,
    not the median as previously stated.)

    Falls back to ``today - 90 days`` when no station qualifies.
    """
    last_dates = [
        date.fromisoformat(info["last_date"])
        for info in station_info.values()
        if info.get("coverage_oos", 0) >= MIN_COVERAGE_FRACTION and info.get("last_date")
    ]
    if not last_dates:
        return today - timedelta(days=90)
    # 25th percentile of the sorted dates (len//4 is always >= 0, so the
    # former max(0, ...) guard was redundant).
    last_dates.sort()
    return last_dates[len(last_dates) // 4] - timedelta(days=30)
# ---------------------------------------------------------------------------
# USGS download + check
# ---------------------------------------------------------------------------
def download_usgs_oos(
    usgs_dir: Path,
    oos_start_year: int,
    oos_end_year: int,
    min_magnitude: float = 4.5,
) -> None:
    """Fetch any USGS yearly catalogue files missing from *usgs_dir*.

    Years already cached as non-empty ``usgs-<year>.csv`` files are
    skipped; download failures are logged as warnings and the loop
    continues with the next year.
    """
    for yr in range(oos_start_year, oos_end_year + 1):
        target = usgs_dir / f"usgs-{yr}.csv"
        cached = target.exists() and target.stat().st_size > 0
        if cached:
            logger.debug("USGS %d: skip (exists)", yr)
            continue
        try:
            usgs_download_year(yr, usgs_dir, min_magnitude=min_magnitude)
        except Exception as exc:
            logger.warning("USGS %d: %s", yr, exc)
        else:
            logger.info("USGS %d: downloaded", yr)
def check_usgs(usgs_dir: Path, today: date, oos_start: str) -> dict:
    """Report which yearly USGS catalogue files are present, the total
    event count, and the catalogue's reliable end date (today minus a
    45-day safety margin for catalogue completeness)."""
    first_year = int(oos_start[:4])
    years_present: list[int] = []
    event_count = 0
    for yr in range(first_year, today.year + 1):
        csv_path = usgs_dir / f"usgs-{yr}.csv"
        if not (csv_path.exists() and csv_path.stat().st_size > 0):
            continue
        years_present.append(yr)
        try:
            event_count += len(pd.read_csv(csv_path, usecols=["time", "mag"]))
        except Exception:
            pass  # unreadable/odd file still counts as present, just uncounted
    return {
        "status": "ok" if years_present else "missing",
        "years_present": years_present,
        "total_events": event_count,
        "reliable_end": str(today - timedelta(days=45)),
        "note": "Catalogue stability: complete within ~30 days; using today-45 days",
    }
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``."""
    data_raw = PROJECT_ROOT / "data" / "raw"
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--no-download", action="store_true",
                        help="Skip download attempts; check existing files only")
    parser.add_argument("--min-mag", type=float, default=4.5,
                        help="Minimum magnitude for USGS download (default 4.5)")
    parser.add_argument("--nmdb-dir", type=Path, default=data_raw / "nmdb")
    parser.add_argument("--usgs-dir", type=Path, default=data_raw / "usgs")
    parser.add_argument("--sidc-dir", type=Path, default=data_raw / "sidc")
    parser.add_argument("--config", type=Path, default=PROJECT_ROOT / "config" / "stations.yaml")
    parser.add_argument("--output-dir", type=Path, default=PROJECT_ROOT / "results")
    return parser.parse_args()
def run(args: argparse.Namespace) -> dict:
    """Execute the full availability workflow and write the reports.

    Steps: (1) download missing NMDB/USGS/SIDC data unless ``--no-download``
    was given, (2) compute each source's reliable end date, (3) print a
    console summary, (4) write JSON and text reports into
    ``args.output_dir``.  Returns the JSON-serializable payload dict.
    """
    args.output_dir.mkdir(parents=True, exist_ok=True)
    today = date.today()
    # Station list = keys of the `stations` map in the YAML config.
    with open(args.config) as fh:
        cfg = yaml.safe_load(fh)
    station_ids = list(cfg["stations"].keys())
    oos_start_year = int(OOS_START[:4])
    # ------------------------------------------------------------------ #
    # 1. Download missing data                                           #
    # ------------------------------------------------------------------ #
    if not args.no_download:
        logger.info("Downloading missing NMDB OOS data (%d-%d) …", oos_start_year, today.year)
        download_nmdb_oos(station_ids, args.nmdb_dir, oos_start_year, today.year)
        logger.info("Downloading missing USGS OOS data (%d-%d) …", oos_start_year, today.year)
        download_usgs_oos(args.usgs_dir, oos_start_year, today.year, min_magnitude=args.min_mag)
        logger.info("Downloading SIDC sunspot data …")
        download_sidc(args.sidc_dir)
    else:
        logger.info("--no-download: skipping download, checking existing files only")
    # ------------------------------------------------------------------ #
    # 2. Check each source                                               #
    # ------------------------------------------------------------------ #
    logger.info("Checking NMDB station coverage …")
    nmdb_info = check_nmdb_stations(station_ids, args.nmdb_dir, OOS_START, today)
    # Stations with enough valid daily bins to use in the OOS analysis.
    good_stations = [
        sid for sid, info in nmdb_info.items()
        if info.get("coverage_oos", 0) >= MIN_COVERAGE_FRACTION
    ]
    # Stations that have some data but a long trailing gap (likely offline).
    flagged = [
        sid for sid, info in nmdb_info.items()
        if info.get("flag_gap") and info.get("coverage_oos", 0) > 0
    ]
    nmdb_end = nmdb_reliable_end(nmdb_info, today)
    usgs_info = check_usgs(args.usgs_dir, today, OOS_START)
    sidc_info = check_sidc(args.sidc_dir, today)
    usgs_end = date.fromisoformat(usgs_info["reliable_end"])
    # SIDC may have no usable file; fall back to a conservative 90-day lag.
    sidc_end = date.fromisoformat(sidc_info["reliable_end"]) if sidc_info.get("reliable_end") else today - timedelta(days=90)
    # Common window ends at the earliest per-source reliable date.
    common_end = min(nmdb_end, usgs_end, sidc_end)
    constraining = {
        "NMDB": nmdb_end,
        "USGS": usgs_end,
        "SIDC": sidc_end,
    }
    # Name of the bottleneck source (earliest reliable end wins).
    constrained_by = min(constraining, key=constraining.get)
    # ------------------------------------------------------------------ #
    # 3. Print summary                                                   #
    # ------------------------------------------------------------------ #
    print()
    print("=" * 72)
    print(" OUT-OF-SAMPLE DATA AVAILABILITY")
    print(f" Run date: {today}")
    print("=" * 72)
    print(f"\n OOS window start: {OOS_START}")
    print(f" NMDB reliable end: {nmdb_end} ({len(good_stations)} stations ≥{MIN_COVERAGE_FRACTION*100:.0f}% coverage)")
    print(f" USGS reliable end: {usgs_end}")
    print(f" SIDC reliable end: {sidc_end}")
    print(f"\n *** Common reliable end: {common_end} (constrained by {constrained_by}) ***")
    # NOTE(review): no separator between the window's start and end dates
    # here — a glyph (likely an arrow or dash) appears lost; confirm intent.
    print(f"\n OOS window: {OOS_START}{common_end}")
    print(f" Duration: {(date.fromisoformat(str(common_end)) - date.fromisoformat(OOS_START)).days} days")
    print()
    print(f" NMDB stations with ≥{MIN_COVERAGE_FRACTION*100:.0f}% OOS coverage ({len(good_stations)}):")
    for sid in sorted(good_stations):
        info = nmdb_info[sid]
        flag = " *** GAP > 30d ***" if info.get("flag_gap") else ""
        print(f" {sid:<8} coverage={info['coverage_oos']*100:5.1f}% last={info['last_date']}{flag}")
    if flagged:
        print(f"\n Stations with >30-day gap (may be offline): {', '.join(sorted(flagged))}")
    print("=" * 72)
    print()
    # ------------------------------------------------------------------ #
    # 4. Save JSON and text report                                       #
    # ------------------------------------------------------------------ #
    payload = {
        "run_date": str(today),
        "oos_start": OOS_START,
        "oos_end": str(common_end),
        "constrained_by": constrained_by,
        "nmdb_reliable_end": str(nmdb_end),
        "usgs_reliable_end": str(usgs_end),
        "sidc_reliable_end": str(sidc_end),
        "good_stations_oos": sorted(good_stations),
        "flagged_stations": sorted(flagged),
        # Drop the internal flag_gap bool from the per-station JSON detail.
        "nmdb_station_detail": {
            sid: {k: v for k, v in info.items() if k != "flag_gap"}
            for sid, info in nmdb_info.items()
        },
        "usgs_detail": usgs_info,
        "sidc_detail": sidc_info,
    }
    json_path = args.output_dir / "data_availability.json"
    # default=str stringifies dates (and anything else non-JSON-native).
    json_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
    logger.info("JSON saved: %s", json_path)
    txt_lines = [
        "OUT-OF-SAMPLE DATA AVAILABILITY REPORT",
        f"Run date: {today}",
        f"OOS window: {OOS_START}{common_end} (constrained by {constrained_by})",
        "",
        f"NMDB stations with ≥{MIN_COVERAGE_FRACTION*100:.0f}% OOS coverage: {len(good_stations)}",
        *[
            f" {sid:<8} coverage={nmdb_info[sid]['coverage_oos']*100:5.1f}% last={nmdb_info[sid]['last_date']}"
            for sid in sorted(good_stations)
        ],
        "",
        f"USGS: years available = {usgs_info['years_present']} events = {usgs_info['total_events']:,}",
        f"SIDC: last_date = {sidc_info.get('last_date')} (provisional note: {sidc_info.get('note', '')})",
    ]
    txt_path = args.output_dir / "data_availability.txt"
    txt_path.write_text("\n".join(txt_lines), encoding="utf-8")
    logger.info("Text report saved: %s", txt_path)
    return payload
if __name__ == "__main__":
    # Script entry point: parse CLI flags, then run the availability check.
    run(_parse_args())