Adds:

- Detailed explanation of why the Hexagon NPU doesn't accelerate inference
- The `offload_op` callback is NULL in `ggml-hexagon.cpp` (see the first sketch below)
- The 2048 MiB limit is hardcoded, not hardware-queried
- Q4_K_M is not supported by the HTP kernels, only Q4_0, Q8_0, IQ4_NL, and MXFP4 (see the second sketch below)
- Full benchmark table: 1B and 7B models at 2K/32K/64K context, CPU vs NPU
- All results show CPU and NPU identical within the margin of error
- 7B test script (`test-7b.sh`)
- Updated deploy script with password handling for the DSP `.so`
- Performance baseline in AGENTS.md
- Cross-compile pitfalls (`CMAKE_SYSROOT`, `rpcmem_init`)
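For context on the `offload_op` point: in upstream ggml, the scheduler only moves an op onto a backend device if that device's optional `offload_op` callback says yes. Below is a minimal sketch of that gate, with the types simplified for illustration (`device_iface` and `dev_offload_op` are stand-in names; the real definitions live in ggml's backend headers):

```c
#include <stdbool.h>
#include <stddef.h>

struct ggml_tensor;  // opaque here; the real definition lives in ggml.h

// Simplified stand-in for ggml's backend-device interface, where the
// offload_op callback is optional.
struct device_iface {
    bool (*offload_op)(void * dev, const struct ggml_tensor * op);
};

// Modeled on ggml's ggml_backend_dev_offload_op(): a device that leaves the
// callback NULL, as ggml-hexagon.cpp currently does, is never offered an op,
// so every op stays on the CPU.
static bool dev_offload_op(const struct device_iface * iface, void * dev,
                           const struct ggml_tensor * op) {
    if (iface->offload_op != NULL) {
        return iface->offload_op(dev, op);
    }
    return false;  // NULL callback: never offload
}
```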
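And on the Q4_K_M point: the HTP kernels cover only the four quantization types listed above, so any tensor outside that set is rejected and runs on the CPU path. A hypothetical condensation of that check (the enum and function names here are placeholders, not the real `GGML_TYPE_*` constants; the actual test lives in `ggml-hexagon.cpp`):

```c
#include <stdbool.h>

// Illustrative subset of ggml's tensor-type enum; names and values are
// placeholders for this sketch.
enum htp_type {
    T_Q4_0, T_Q8_0, T_IQ4_NL, T_MXFP4,  // the four types with HTP kernels
    T_Q4_K,                             // what Q4_K_M models mostly use
};

// Condensed form of the support check: anything outside the four covered
// types is rejected, so a Q4_K_M model falls back to the CPU.
static bool htp_supports_type(enum htp_type t) {
    switch (t) {
        case T_Q4_0:
        case T_Q8_0:
        case T_IQ4_NL:
        case T_MXFP4:
            return true;
        default:
            return false;  // e.g. T_Q4_K
    }
}
```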
test-7b.sh:

```bash
#!/usr/bin/env bash
# test-7b.sh - run 7B model benchmarks on the Q6A at various context sizes.
set -euo pipefail

# Target board, model path, and deploy directory; all overridable via the environment.
Q6A="${Q6A:-radxa@192.168.1.11}"
MODEL="${MODEL:-/home/radxa/models/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf}"
DEPLOY_DIR="${DEPLOY_DIR:-llama/bin}"

# Context sizes to sweep, in tokens.
CONTEXTS=("2048" "8192" "32768" "65536")

echo "=== 7B Model Benchmarks ==="
echo "Model: ${MODEL}"
echo ""

for ctx in "${CONTEXTS[@]}"; do
    echo "--- Context ${ctx} (NPU) ---"
    # "|| true" keeps set -e/pipefail from aborting the sweep when a run hits
    # the 120 s timeout or grep finds no match.
    ssh "${Q6A}" "cd ~/${DEPLOY_DIR} && GGML_HEXAGON=1 LD_LIBRARY_PATH=. timeout 120 ./llama-cli -m '${MODEL}' -n 8 -p Hello -ngl 0 -c ${ctx} --no-display-prompt 2>&1" | grep -E 'Prompt:|Generation:|memory' || true

    echo ""

    echo "--- Context ${ctx} (CPU-only) ---"
    # Rename the Hexagon backend .so (and its version symlinks) so llama-cli
    # cannot load it, run the same benchmark, then restore the names.
    ssh "${Q6A}" "cd ~/${DEPLOY_DIR} && mv libggml-hexagon.so libggml-hexagon.so.disabled 2>/dev/null; mv libggml-hexagon.so.0 libggml-hexagon.so.0.disabled 2>/dev/null; mv libggml-hexagon.so.0.9.11 libggml-hexagon.so.0.9.11.disabled 2>/dev/null; LD_LIBRARY_PATH=. timeout 120 ./llama-cli -m '${MODEL}' -n 8 -p Hello -ngl 0 -c ${ctx} --no-display-prompt 2>&1 | grep -E 'Prompt:|Generation:'; mv libggml-hexagon.so.disabled libggml-hexagon.so 2>/dev/null; mv libggml-hexagon.so.0.disabled libggml-hexagon.so.0 2>/dev/null; mv libggml-hexagon.so.0.9.11.disabled libggml-hexagon.so.0.9.11 2>/dev/null" || true

    echo ""
done

echo "=== Done ==="
```
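The board address, model path, and deploy directory are plain environment overrides, so the script can be pointed at another setup without edits, e.g. `Q6A=user@host MODEL=/path/to/model.gguf ./test-7b.sh` (both values in that example are placeholders). Disabling the NPU by renaming `libggml-hexagon.so` rather than rebuilding keeps the CPU and NPU runs on the identical binary, which is presumably what makes the A/B numbers in the benchmark table comparable.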