#!/usr/bin/env python3
"""
scripts/benchmark_gpu.py
~~~~~~~~~~~~~~~~~~~~~~~~~
Benchmark CPU (joblib, 28 cores) vs GPU (CuPy) surrogate generation.

Test signal: N=16,000-point AR(1) with ρ=0.85.
  16,000 = 2^7 × 5^3 is an FFT-friendly size; it was chosen to be larger
  than the real 3,215-point series but still manageable on GPU.
  n_surrogates = 10,000  (configurable via --n-surrogates)

Validates numerical equivalence: CPU vs GPU surrogate p-values must agree
to within ±2/sqrt(n_surrogates) Monte Carlo error.

Outputs
-------
  results/benchmark_gpu.txt  — timing table and equivalence check
"""

from __future__ import annotations

import argparse
import sys
import time
from pathlib import Path

import numpy as np

PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from crq.stats.surrogates import surrogate_xcorr_test
from crq.stats.surrogates_gpu import (
    gpu_available,
    surrogate_xcorr_test_gpu,
    auto_batch_size,
    _GPU_REASON,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def make_ar1(N: int, rho: float = 0.85, seed: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Generate two float32 AR(1) series of length ``N``.

    Both series use the same coefficient ``rho`` and start from the same
    first sample (``x[0] == y[0]``); after that they are driven by separate
    rows of one standard-normal noise draw.

    Parameters
    ----------
    N : number of samples per series (must be >= 1).
    rho : AR(1) coefficient; must satisfy ``|rho| <= 1`` so that the
        innovation scale ``sqrt(1 - rho**2)`` is real.
    seed : seed passed to ``np.random.default_rng``.

    Returns
    -------
    (x, y) : two ``np.float32`` arrays of shape ``(N,)``.

    Raises
    ------
    ValueError
        If ``N < 1`` or ``|rho| > 1``.
    """
    if N < 1:
        raise ValueError(f"N must be >= 1, got {N}")
    if abs(rho) > 1:
        raise ValueError(f"rho must satisfy |rho| <= 1, got {rho}")

    rng = np.random.default_rng(seed)
    x = np.empty(N, dtype=np.float32)
    y = np.empty(N, dtype=np.float32)
    # The order of RNG draws (scalar first, then the (2, N) block) is kept
    # fixed so that a given seed always yields the same series.
    x[0] = y[0] = rng.standard_normal()
    noise = rng.standard_normal((2, N)).astype(np.float32)

    # Loop-invariant innovation scale, hoisted out of the recursion.
    innovation_scale = np.sqrt(1 - rho**2)
    # Each sample depends on the previous one, so the recursion is sequential.
    for i in range(1, N):
        x[i] = rho * x[i - 1] + innovation_scale * noise[0, i]
        y[i] = rho * y[i - 1] + innovation_scale * noise[1, i]
    return x, y


def _fmt_time(seconds: float) -> str:
    """Format a duration as seconds (below one minute) or minutes."""
    if seconds < 60:
        return f"{seconds:.1f}s"
    return f"{seconds/60:.1f}min"


def _print_banner(title: str) -> None:
    """Print ``title`` between two 70-character rules, preceded by a blank line."""
    print()
    print("=" * 70)
    print(f"  {title}")
    print("=" * 70)


def _summary_table(all_results: dict, times_symbol: str = "×") -> list[str]:
    """Build the timing/equivalence table shared by the console and the report.

    Parameters
    ----------
    all_results : mapping of method name to the dict returned by
        ``run_comparison``.
    times_symbol : suffix appended to the speedup figure ("×" on the
        console, plain "x" in the text report).

    Returns
    -------
    List of lines: header, separator, then one row per method.  GPU columns
    read "N/A" when the corresponding result entry is ``None``.
    """
    table = [
        f"{'Method':<8} {'CPU (s)':>10} {'GPU (s)':>10} {'Speedup':>9} {'Equiv':>7}",
        "-" * 48,
    ]
    for m, r in all_results.items():
        gpu_s = f"{r['gpu_time_s']:.1f}" if r["gpu_time_s"] is not None else "N/A"
        speedup = f"{r['speedup']:.1f}{times_symbol}" if r["speedup"] is not None else "N/A"
        if r["equiv_ok"] is None:
            equiv = "N/A"
        else:
            equiv = "PASS" if r["equiv_ok"] else "FAIL"
        table.append(f"{m:<8} {r['cpu_time_s']:>10.1f} {gpu_s:>10} {speedup:>9} {equiv:>7}")
    return table


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Exits through ``ArgumentParser.error`` (status 2) when a count argument
    is out of range, so bad values fail before any benchmark work starts.
    """
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--n-surrogates", type=int, default=10_000)
    p.add_argument("--n-timesteps", type=int, default=16_000)
    p.add_argument("--method", default="both", choices=["phase", "iaaft", "both"])
    p.add_argument("--iaaft-iter", type=int, default=100)
    p.add_argument("--n-jobs", type=int, default=-1, help="CPU joblib workers")
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--lag-range", type=int, default=200, help="±lag range in bins")
    p.add_argument("--vram-gb", type=float, default=10.0)
    p.add_argument("--output-dir", type=Path, default=PROJECT_ROOT / "results")
    args = p.parse_args()

    # The Monte Carlo tolerance divides by sqrt(n_surrogates), and the series
    # generator needs at least one sample.
    if args.n_surrogates < 1:
        p.error("--n-surrogates must be >= 1")
    if args.n_timesteps < 1:
        p.error("--n-timesteps must be >= 1")
    if args.lag_range < 0:
        p.error("--lag-range must be >= 0")
    return args


# ---------------------------------------------------------------------------
# Run one method, CPU then GPU, return timing + equivalence result
# ---------------------------------------------------------------------------

def run_comparison(
    x: np.ndarray,
    y: np.ndarray,
    lag_bins: np.ndarray,
    method: str,
    n_surrogates: int,
    n_jobs: int,
    iaaft_iter: int,
    seed: int,
    vram_gb: float,
) -> dict:
    """Time one surrogate method on CPU and (if available) GPU and compare.

    Parameters
    ----------
    x, y : input series handed to both surrogate tests.
    lag_bins : lags passed to both surrogate tests.
    method : surrogate method name forwarded to both implementations.
    n_surrogates : number of surrogates for the timed runs.
    n_jobs : joblib worker count for the CPU run.
    iaaft_iter : IAAFT iteration count forwarded to both implementations.
    seed : seed forwarded to both implementations.
    vram_gb : VRAM budget forwarded to the GPU implementation.

    Returns
    -------
    dict with keys ``cpu_time_s``, ``cpu_p_global``, ``gpu_time_s``,
    ``gpu_p_global``, ``speedup``, ``equiv_ok``, ``mc_tol`` and ``delta_p``.
    Every GPU-related entry is ``None`` when no GPU is available, so the
    schema is the same on both paths.
    """
    _print_banner(f"Method: {method.upper()}  |  N={len(x)}  |  n_surr={n_surrogates:,}")
    result = {}

    # ------------------------------------------------------------------
    # CPU benchmark
    # ------------------------------------------------------------------
    print(f"\n[CPU] joblib n_jobs={n_jobs}, method={method}")
    t0 = time.perf_counter()
    cpu_out = surrogate_xcorr_test(
        x, y, lag_bins,
        n_surrogates=n_surrogates,
        method=method,
        seed=seed,
        n_jobs=n_jobs,
        iaaft_n_iter=iaaft_iter,
    )
    cpu_time = time.perf_counter() - t0
    print(f"  p_global = {cpu_out['p_global']:.4f}  |  time = {_fmt_time(cpu_time)}")
    result["cpu_time_s"] = cpu_time
    result["cpu_p_global"] = cpu_out["p_global"]

    # ------------------------------------------------------------------
    # GPU benchmark
    # ------------------------------------------------------------------
    if not gpu_available():
        print(f"\n[GPU] UNAVAILABLE — {_GPU_REASON}")
        # Fill every GPU-derived key so callers see one consistent schema.
        for key in ("gpu_time_s", "gpu_p_global", "speedup", "equiv_ok", "mc_tol", "delta_p"):
            result[key] = None
        return result

    T = len(x)
    batch = auto_batch_size(T, vram_budget_gb=vram_gb, method=method)
    print(f"\n[GPU] device={_GPU_REASON}  batch_size={batch:,}")

    # Warm-up pass (avoids counting CUDA JIT in timing)
    _ = surrogate_xcorr_test_gpu(
        x, y, lag_bins[:10],
        n_surrogates=32,
        method=method,
        seed=seed,
        iaaft_n_iter=min(iaaft_iter, 5),
        vram_budget_gb=vram_gb,
    )

    t0 = time.perf_counter()
    gpu_out = surrogate_xcorr_test_gpu(
        x, y, lag_bins,
        n_surrogates=n_surrogates,
        method=method,
        seed=seed,
        iaaft_n_iter=iaaft_iter,
        vram_budget_gb=vram_gb,
    )
    gpu_time = time.perf_counter() - t0
    speedup = cpu_time / gpu_time
    print(f"  p_global = {gpu_out['p_global']:.4f}  |  time = {_fmt_time(gpu_time)}  |  speedup = {speedup:.1f}×")

    # ------------------------------------------------------------------
    # Numerical equivalence check
    # ±2/sqrt(n_surrogates) Monte Carlo tolerance on p_global
    # ------------------------------------------------------------------
    mc_tol = 2.0 / np.sqrt(n_surrogates)
    delta = abs(cpu_out["p_global"] - gpu_out["p_global"])
    # Plain bool rather than a NumPy scalar, so the result dict stays simple.
    equiv_ok = bool(delta <= mc_tol)
    status = "PASS ✓" if equiv_ok else "FAIL ✗"
    print(f"\n  Equivalence check: |Δp_global| = {delta:.4f}  (tolerance {mc_tol:.4f})  → {status}")

    # Also check peak lag agreement (informational only; not part of equiv_ok)
    cpu_peak = cpu_out["observed_peak_lag"]
    gpu_peak = gpu_out["observed_peak_lag"]
    if cpu_peak == gpu_peak:
        print(f"  Peak lag: CPU={cpu_peak}  GPU={gpu_peak}  → AGREE ✓")
    else:
        print(f"  Peak lag: CPU={cpu_peak}  GPU={gpu_peak}  → DIFFER (check seed handling)")

    result["gpu_time_s"] = gpu_time
    result["gpu_p_global"] = gpu_out["p_global"]
    result["speedup"] = speedup
    result["equiv_ok"] = equiv_ok
    result["mc_tol"] = mc_tol
    result["delta_p"] = delta
    return result


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def run(args: argparse.Namespace) -> None:
    """Run the benchmark for the requested method(s) and write the report.

    Creates ``args.output_dir`` if needed, builds the AR(1) test series,
    runs ``run_comparison`` once per method, prints a summary table and
    saves the same table to ``<output_dir>/benchmark_gpu.txt``.
    """
    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nBuilding AR(1) series: N={args.n_timesteps:,}  ρ=0.85  seed={args.seed}")
    x, y = make_ar1(args.n_timesteps, seed=args.seed)
    lag_bins = np.arange(-args.lag_range, args.lag_range + 1, dtype=int)

    # Computed once and reused for the batch-size preview and the main loop.
    methods = ["phase", "iaaft"] if args.method == "both" else [args.method]
    has_gpu = gpu_available()

    if has_gpu:
        print(f"GPU: {_GPU_REASON}")
        for m in methods:
            batch = auto_batch_size(args.n_timesteps, vram_budget_gb=args.vram_gb, method=m)
            print(f"Auto batch size for T={args.n_timesteps} ({m}): {batch:,} surrogates/pass")
    else:
        print(f"GPU: UNAVAILABLE — {_GPU_REASON}  (running CPU-only benchmark)")

    all_results = {}
    for m in methods:
        all_results[m] = run_comparison(
            x, y, lag_bins,
            method=m,
            n_surrogates=args.n_surrogates,
            n_jobs=args.n_jobs,
            iaaft_iter=args.iaaft_iter,
            seed=args.seed,
            vram_gb=args.vram_gb,
        )

    # ------------------------------------------------------------------
    # Summary table
    # ------------------------------------------------------------------
    _print_banner("Summary")
    print("\n" + "\n".join(_summary_table(all_results, times_symbol="×")))

    # ------------------------------------------------------------------
    # Save text report
    # ------------------------------------------------------------------
    # _GPU_REASON is printed elsewhere in this script as the device
    # description when a GPU is present and as the failure reason otherwise;
    # label it accordingly so the report is not misleading on CPU-only hosts.
    gpu_label = _GPU_REASON if has_gpu else f"UNAVAILABLE — {_GPU_REASON}"
    lines = [
        "GPU vs CPU Surrogate Benchmark",
        "=" * 50,
        f"N timesteps:   {args.n_timesteps:,}",
        f"N surrogates:  {args.n_surrogates:,}",
        f"IAAFT iter:    {args.iaaft_iter}",
        f"CPU n_jobs:    {args.n_jobs}",
        f"GPU device:    {gpu_label}",
        "",
        *_summary_table(all_results, times_symbol="x"),
    ]

    out_path = args.output_dir / "benchmark_gpu.txt"
    # Explicit UTF-8 (the GPU label may contain non-ASCII text) and a
    # trailing newline so the report is a well-formed text file.
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"\nReport saved: {out_path}")


if __name__ == "__main__":
    run(_parse_args())