- offload_op callback now implemented (MUL_MAT/MUL_MAT_ID)
- Memory raised to 10 GiB
- Direct compute mode bypasses broken dspqueue on this board
- Q8_0 1B model: 115 t/s prompt (4.3x vs CPU 27 t/s)
- Generation 9.6 t/s (27% slower than CPU, expected)
- dspqueue path fails with error 0x0000002e
- llama-cli renamed to llama-simple in current build
- Updated scripts for direct-compute mode
- Docs updated with new findings and instructions
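For reference, these are the two environment toggles the updated scripts rely on (variable names taken from the script below; the exact semantics are assumptions based on the observed behavior, not documented flags, and the model path shown is just the script's default):

    # NPU path via direct compute, bypassing the broken dspqueue
    GGML_HEXAGON_DIRECT_COMPUTE=1 ./llama-cli -m llama-1b-q8_0.gguf -p 'Hello' -n 64
    # CPU-only baseline for comparison
    GGML_HEXAGON_NDEV=0 ./llama-cli -m llama-1b-q8_0.gguf -p 'Hello' -n 64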
#!/usr/bin/env bash
# test-on-q6a.sh — Run llama-cli inference test on Q6A with Hexagon backend (direct compute)
set -euo pipefail

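# All settings below can be overridden from the caller's environment.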
Q6A="${Q6A:-radxa@192.168.1.11}"
MODEL="${MODEL:-/home/radxa/models/llama-1b-q8_0.gguf}"
DEPLOY_DIR="${DEPLOY_DIR:-llama/bin}"
N_TOKENS="${N_TOKENS:-64}"
PROMPT="${PROMPT:-Hello, what is your name?}"

echo "=== Running NPU inference test on Q6A (direct compute) ==="
|
|
echo "Model: ${MODEL}"
|
|
echo "Tokens: ${N_TOKENS}"
|
|
echo ""
|
|
|
|
ssh "${Q6A}" "
|
|
cd ~/${DEPLOY_DIR}
|
|
echo '--- llama-cli version ---'
|
|
./llama-cli --version 2>&1 || true
|
|
echo ''
|
|
echo '--- Running inference with direct compute ---'
|
|
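  # Direct compute bypasses the dspqueue path, which fails on this board (error 0x0000002e)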
  GGML_HEXAGON_DIRECT_COMPUTE=1 ./llama-cli \
    -m '${MODEL}' \
    -n '${N_TOKENS}' \
    -p '${PROMPT}' \
    --no-display-prompt \
    -c 2048 \
    2>&1
  echo ''
  echo '--- exit: ' \$? '---'
  echo ''

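  # GGML_HEXAGON_NDEV=0 disables the Hexagon devices, forcing a CPU-only run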
  echo '--- CPU-only baseline ---'
  GGML_HEXAGON_NDEV=0 ./llama-cli \
    -m '${MODEL}' \
    -n '${N_TOKENS}' \
    -p '${PROMPT}' \
    --no-display-prompt \
    -c 2048 \
    2>&1
  echo ''
  echo '--- exit: ' \$? '---'
" 2>&1
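Typical invocations, using the script's own defaults (the alternate host and model path below are placeholders, not real deployments):

    ./test-on-q6a.sh                                               # defaults: 64 tokens, Q8_0 1B model
    N_TOKENS=128 PROMPT='Explain NPUs briefly.' ./test-on-q6a.sh   # longer run, custom prompt
    Q6A=radxa@10.0.0.5 MODEL=/home/radxa/models/other.gguf ./test-on-q6a.sh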