cosmicraysandearthquakes/tests/test_nmdb.py
root e5a812fa14 Initial commit: full analysis pipeline source code
Scripts 01-08 implement the complete cosmic-ray/earthquake correlation
analysis from data ingestion through out-of-sample validation and
combined timeseries sinusoid fitting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 02:45:10 +02:00

178 lines
7.2 KiB
Python

"""Tests for src/crq/ingest/nmdb.py"""
from __future__ import annotations
import textwrap
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from crq.ingest.nmdb import (
compute_daily_coverage,
log_avg,
parse_nmdb_csv,
resample_daily,
)
# ---------------------------------------------------------------------------
# log_avg
# ---------------------------------------------------------------------------
class TestLogAvg:
def test_single_value(self) -> None:
"""log_avg of a single value x should equal x."""
assert log_avg(np.array([40.0])) == pytest.approx(40.0, rel=1e-9)
def test_symmetric_pair(self) -> None:
"""log_avg([10, 10]) == 10."""
assert log_avg(np.array([10.0, 10.0])) == pytest.approx(10.0, rel=1e-9)
def test_known_value(self) -> None:
"""log_avg([20, 30]) = 10*log10(mean(10^2, 10^3)) = 10*log10(550) ≈ 27.404."""
expected = 10.0 * np.log10(np.mean([10**2, 10**3]))
assert log_avg(np.array([20.0, 30.0])) == pytest.approx(expected, rel=1e-9)
def test_nan_ignored(self) -> None:
"""NaN values should be excluded from the average."""
result = log_avg(np.array([10.0, np.nan, 10.0]))
assert result == pytest.approx(10.0, rel=1e-9)
def test_all_nan_returns_nan(self) -> None:
assert np.isnan(log_avg(np.array([np.nan, np.nan])))
def test_pandas_series(self) -> None:
"""Accept a pd.Series as input."""
s = pd.Series([10.0, 10.0])
assert log_avg(s) == pytest.approx(10.0, rel=1e-9)
# ---------------------------------------------------------------------------
# parse_nmdb_csv
# ---------------------------------------------------------------------------
class TestParseNmdbCsv:
def test_returns_dataframe(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
assert isinstance(df, pd.DataFrame)
assert "OULU" in df.columns
def test_index_is_datetime(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
assert isinstance(df.index, pd.DatetimeIndex)
def test_row_count(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
assert len(df) == 24
def test_values_are_positive_floats(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
assert (df["OULU"] > 0).all()
def test_sentinel_zero_becomes_nan(self, tmp_path: Path) -> None:
"""Zero-valued rows (sentinel for missing) must become NaN."""
csv = tmp_path / "OULU2019.csv"
csv.write_text(
"start_date_time;OULU\n"
"2019-01-01 00:59:00;0.0\n"
"2019-01-01 01:59:00;6850.0\n"
)
df = parse_nmdb_csv(csv, "OULU")
assert np.isnan(df["OULU"].iloc[0])
assert df["OULU"].iloc[1] == pytest.approx(6850.0)
def test_empty_file_returns_empty_df(self, tmp_path: Path) -> None:
csv = tmp_path / "empty.csv"
csv.write_text("")
df = parse_nmdb_csv(csv, "OULU")
assert df.empty
def test_duplicate_timestamps_dropped(self, tmp_path: Path) -> None:
csv = tmp_path / "dup.csv"
csv.write_text(
"start_date_time;OULU\n"
"2019-01-01 00:59:00;6850.0\n"
"2019-01-01 00:59:00;6900.0\n" # duplicate
)
df = parse_nmdb_csv(csv, "OULU")
assert len(df) == 1
assert df["OULU"].iloc[0] == pytest.approx(6850.0)
# ---------------------------------------------------------------------------
# compute_daily_coverage
# ---------------------------------------------------------------------------
class TestDailyCoverage:
def test_full_day_coverage_is_one(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
cov = compute_daily_coverage(df["OULU"])
assert float(cov.iloc[0]) == pytest.approx(1.0)
def test_half_day_coverage(self, nmdb_csv_partial: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_partial, "OULU")
cov = compute_daily_coverage(df["OULU"])
assert float(cov.iloc[0]) == pytest.approx(0.5)
def test_coverage_range(self, nmdb_csv_file: Path) -> None:
df = parse_nmdb_csv(nmdb_csv_file, "OULU")
cov = compute_daily_coverage(df["OULU"])
assert (cov >= 0).all() and (cov <= 1).all()
def test_all_nan_gives_zero_coverage(self) -> None:
idx = pd.date_range("2019-01-01", periods=24, freq="h")
s = pd.Series([np.nan] * 24, index=idx)
cov = compute_daily_coverage(s)
assert float(cov.iloc[0]) == pytest.approx(0.0)
# ---------------------------------------------------------------------------
# resample_daily — coverage threshold + NaN propagation
# ---------------------------------------------------------------------------
class TestResampleDaily:
def test_full_day_is_not_flagged(self, nmdb_csv_file: Path) -> None:
hourly = parse_nmdb_csv(nmdb_csv_file, "OULU")
daily = resample_daily(hourly, "OULU")
assert not daily["OULU_flagged"].iloc[0]
assert not np.isnan(daily["OULU"].iloc[0])
def test_partial_day_below_threshold_is_nan(self, nmdb_csv_partial: Path) -> None:
"""12/24 hours = 0.5 coverage < default 0.60 threshold → NaN."""
hourly = parse_nmdb_csv(nmdb_csv_partial, "OULU")
daily = resample_daily(hourly, "OULU", coverage_threshold=0.60)
assert daily["OULU_flagged"].iloc[0]
assert np.isnan(daily["OULU"].iloc[0])
def test_partial_day_above_threshold_is_kept(self, nmdb_csv_partial: Path) -> None:
"""12/24 = 0.5 coverage — kept when threshold is 0.40."""
hourly = parse_nmdb_csv(nmdb_csv_partial, "OULU")
daily = resample_daily(hourly, "OULU", coverage_threshold=0.40)
assert not daily["OULU_flagged"].iloc[0]
assert not np.isnan(daily["OULU"].iloc[0])
def test_two_days_mixed_coverage(self, nmdb_csv_two_days: Path) -> None:
"""Day 1 = 24 hrs (ok), Day 2 = 10 hrs (flagged at 0.60 threshold)."""
hourly = parse_nmdb_csv(nmdb_csv_two_days, "OULU")
daily = resample_daily(hourly, "OULU", coverage_threshold=0.60)
assert not daily["OULU_flagged"].iloc[0], "Day 1 should not be flagged"
assert daily["OULU_flagged"].iloc[1], "Day 2 should be flagged"
assert not np.isnan(daily["OULU"].iloc[0])
assert np.isnan(daily["OULU"].iloc[1])
def test_output_columns(self, nmdb_csv_file: Path) -> None:
hourly = parse_nmdb_csv(nmdb_csv_file, "OULU")
daily = resample_daily(hourly, "OULU")
assert set(daily.columns) == {"OULU", "OULU_coverage", "OULU_flagged"}
def test_no_fillna_zero_bias(self) -> None:
"""Regression: NaN gaps must NOT be replaced with 0 before averaging."""
idx = pd.date_range("2019-01-01", periods=24, freq="h")
vals = [6850.0] * 12 + [np.nan] * 12
hourly = pd.DataFrame({"OULU": vals}, index=idx)
daily = resample_daily(hourly, "OULU", coverage_threshold=0.0)
# Mean of the 12 valid values — must NOT be dragged down toward zero
expected_mean = np.mean([6850.0] * 12)
assert daily["OULU"].iloc[0] == pytest.approx(expected_mean, rel=1e-6)