Complete documentation for running llama.cpp with the Qualcomm Hexagon CDSP v68 NPU backend on a Radxa Dragon Q6A (SA8775P) board.

Includes:
- Corrected FastRPC test harness (libcdsprpc handles INIT_CREATE)
- Minimal DSP stub library
- Cross-compile build script for llama.cpp
- Deploy and test scripts for Q6A
- Kernel FastRPC header for reference
- Comprehensive README with lessons learned

Key findings:
- Do NOT call FASTRPC_IOCTL_INIT_CREATE manually
- Must link against the Q6A system libcdsprpc (not the SDK cross-compiled one)
- Build verified: 32 t/s prompt processing, 4.5 t/s generation on a 1B model
31 lines
848 B
Bash
Executable file
#!/usr/bin/env bash
# test-on-q6a.sh — Run llama-cli inference test on Q6A with Hexagon backend
#
# Environment overrides:
#   Q6A         ssh target (user@host) of the Q6A board
#   MODEL       path to the GGUF model on the board
#   DEPLOY_DIR  directory under the remote $HOME containing llama-cli and libs
#   N_TOKENS    number of tokens to generate
#   PROMPT      prompt text (must not contain single quotes — it is embedded
#               in single quotes in the remote command line)
set -euo pipefail

Q6A="${Q6A:-radxa@192.168.1.11}"
MODEL="${MODEL:-/home/radxa/models/llama-3.2-1b-q4km.gguf}"
DEPLOY_DIR="${DEPLOY_DIR:-llama/bin}"
N_TOKENS="${N_TOKENS:-32}"
PROMPT="${PROMPT:-Hello, what is your name?}"

echo "=== Running inference test on Q6A ==="
echo "Model: ${MODEL}"
echo "Tokens: ${N_TOKENS}"
echo ""

# The remote command is double-quoted locally, so ${MODEL}, ${N_TOKENS} and
# ${PROMPT} are expanded on THIS machine; the single quotes around them
# survive into the remote shell. Anything that must be evaluated remotely
# (here: the exit status) has its '$' escaped as '\$'.
#
# Bug fixed vs. the original: 'echo ... $? ...' was unescaped, so the LOCAL
# shell substituted its own last exit status before ssh ever ran. The status
# is now captured remotely, immediately after llama-cli, before any echo can
# clobber it.
ssh "${Q6A}" "
  cd ~/${DEPLOY_DIR}
  echo '--- llama-cli version ---'
  ./llama-cli --version 2>&1 || true
  echo ''
  echo '--- Running inference with GGML_HEXAGON=1 ---'
  rc=0
  GGML_HEXAGON=1 LD_LIBRARY_PATH=. ./llama-cli \
    -m '${MODEL}' \
    -n '${N_TOKENS}' \
    -p '${PROMPT}' \
    -ngl 0 \
    --no-display-prompt \
    2>&1 || rc=\$?
  echo ''
  echo '--- exit: '\$rc' ---'
" 2>&1
|