cosmicraysandearthquakes/scripts/01_download_data.py
root e5a812fa14 Initial commit: full analysis pipeline source code
Scripts 01-08 implement the complete cosmic-ray/earthquake correlation
analysis from data ingestion through out-of-sample validation and
combined timeseries sinusoid fitting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 02:45:10 +02:00

133 lines
4.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
scripts/01_download_data.py
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Idempotent data downloader for the CRQ project.
Downloads:
1. NMDB hourly pressure-corrected neutron counts (all stations, year-by-year)
2. USGS earthquake catalogue (M≥4.5, year-by-year)
3. SIDC daily sunspot numbers
Files are skipped if they already exist and are non-empty. Failed downloads
are logged and skipped (the script continues rather than aborting).
Usage
-----
# Full run (1960-2019, all 44 stations) — takes several hours on first run
python scripts/01_download_data.py
# Quick subset: 2 stations, 2 years — useful for CI / smoke-testing
python scripts/01_download_data.py --subset
# Custom range
python scripts/01_download_data.py --start-year 2000 --end-year 2002 --stations OULU THUL
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
import yaml
# Ensure project src is importable when run as a script
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
from crq.ingest import nmdb as _nmdb
from crq.ingest import usgs as _usgs
from crq.ingest import sidc as _sidc
# Log line layout: timestamp, level padded to 8 columns, logger name, message.
# The logger name and the message are separated by ": " so the two fields do
# not run together in the emitted line.
LOG_FORMAT = "%(asctime)s %(levelname)-8s %(name)s: %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"

# Set up root logging at INFO level. ``basicConfig`` does nothing if the root
# logger already has handlers installed.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)

# Logger used by this script for all progress and error messages.
logger = logging.getLogger("crq.download")
def _load_station_names(config_path: Path) -> list[str]:
    """Return the station codes listed in the station configuration file.

    Parameters
    ----------
    config_path
        Path to a YAML file whose top level contains a ``stations`` mapping.
        The mapping's keys are the station codes.

    Returns
    -------
    list[str]
        The keys of the ``stations`` mapping, in file order.

    Raises
    ------
    ValueError
        If the file is empty or has no top-level ``stations`` mapping.
    """
    # Read as UTF-8 explicitly: the platform default encoding may differ
    # (e.g. on Windows) and would mis-decode non-ASCII content.
    with config_path.open(encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # safe_load returns None for an empty document, so check the shape before
    # indexing and fail with a message that names the offending file.
    stations = cfg.get("stations") if isinstance(cfg, dict) else None
    if not isinstance(stations, dict):
        raise ValueError(
            f"{config_path}: expected a top-level 'stations' mapping"
        )
    return list(stations)
def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument("--subset", action="store_true",
help="Quick 2-station / 2-year smoke-test (OULU + THUL, 2018-2019)")
p.add_argument("--start-year", type=int, default=1960)
p.add_argument("--end-year", type=int, default=2019)
p.add_argument("--stations", nargs="+", metavar="CODE",
help="Override station list (default: all from config/stations.yaml)")
p.add_argument("--min-magnitude", type=float, default=4.5)
p.add_argument("--data-dir", type=Path, default=PROJECT_ROOT / "data" / "raw")
p.add_argument("--config", type=Path, default=PROJECT_ROOT / "config" / "stations.yaml")
return p.parse_args()
def _download_nmdb(stations: list[str], first_year: int, last_year: int, dest: Path) -> None:
    """Fetch NMDB data for every station/year pair; log failures and a tally."""
    logger.info("=== NMDB download: %d stations × %d years ===",
                len(stations), last_year - first_year + 1)
    succeeded = failed = 0
    for code in stations:
        for yr in range(first_year, last_year + 1):
            try:
                _nmdb.download_station_year(code, yr, dest)
            except Exception as err:
                # Best effort: record the failure and move on to the next file.
                logger.error("NMDB %s %d: %s", code, yr, err)
                failed += 1
            else:
                succeeded += 1
        logger.info("station %s done", code)
    logger.info("NMDB: %d ok, %d errors", succeeded, failed)


def _download_usgs(first_year: int, last_year: int, min_magnitude: float, dest: Path) -> None:
    """Fetch the USGS catalogue one year at a time; log failures and a tally."""
    logger.info("=== USGS download: years %d-%d, M≥%.1f ===",
                first_year, last_year, min_magnitude)
    succeeded = failed = 0
    for yr in range(first_year, last_year + 1):
        try:
            _usgs.download_year(yr, dest, min_magnitude=min_magnitude)
        except Exception as err:
            # Best effort: record the failure and move on to the next year.
            logger.error("USGS %d: %s", yr, err)
            failed += 1
        else:
            succeeded += 1
    logger.info("USGS: %d ok, %d errors", succeeded, failed)


def _download_sidc(dest: Path) -> None:
    """Fetch the SIDC sunspot data; a failure is logged, not raised."""
    logger.info("=== SIDC sunspot download ===")
    try:
        _sidc.download_sunspots(dest)
    except Exception as err:
        logger.error("sunspots: %s", err)
    else:
        logger.info("sunspots ok")


def main() -> None:
    """Run the NMDB, USGS and SIDC download phases in that order.

    With ``--subset`` the stations and years are fixed to OULU + THUL and
    2018-2019. Otherwise the stations come from ``--stations`` or, if that is
    not given, from the station config file, and the years from
    ``--start-year`` / ``--end-year``. Exceptions raised by individual
    downloads are logged and do not stop the run.
    """
    opts = _parse_args()

    if opts.subset:
        stations, first_year, last_year = ["OULU", "THUL"], 2018, 2019
        logger.info("SUBSET mode: stations=%s years=%d-%d", stations, first_year, last_year)
    else:
        stations = opts.stations or _load_station_names(opts.config)
        first_year, last_year = opts.start_year, opts.end_year

    # One sub-directory per data source beneath the raw-data root.
    nmdb_dest, usgs_dest, sidc_dest = (
        opts.data_dir / source for source in ("nmdb", "usgs", "sidc")
    )

    _download_nmdb(stations, first_year, last_year, nmdb_dest)
    _download_usgs(first_year, last_year, opts.min_magnitude, usgs_dest)
    _download_sidc(sidc_dest)

    logger.info("Download complete.")
# Run the downloader only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()