commit 18970e3258fa79a5438902f58973556ee9abdf74 Author: Jimmy Devine Date: Sat May 2 10:28:51 2026 +0200 Initial commit: Q6A Hexagon v68 + llama.cpp guide Complete documentation for running llama.cpp with the Qualcomm Hexagon CDSP v68 NPU backend on a Radxa Dragon Q6A (SA8775P) board. Includes: - Corrected FastRPC test harness (libcdsprpc handles INIT_CREATE) - Minimal DSP stub library - Cross-compile build script for llama.cpp - Deploy and test scripts for Q6A - Kernel FastRPC header for reference - Comprehensive README with lessons learned Key findings: - Do NOT call FASTRPC_IOCTL_INIT_CREATE manually - Must link against Q6A system libcdsprpc (not SDK cross-compiled) - Build verified: 32 t/s prompt, 4.5 t/s generation on 1B model diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..61a1345 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.o +*.so +*.a +llama-cli +test_fastrpc_fixed +.DS_Store +*.swp diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..f95c299 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,45 @@ +# Q6A Hexagon Guide — AGENTS.md + +This repo documents how to get llama.cpp running with the Qualcomm Hexagon CDSP v68 backend on a Radxa Dragon Q6A board (SA8775P). + +## Key Rules + +1. **Do NOT call FASTRPC_IOCTL_INIT_CREATE manually.** Let libcdsprpc handle it. +2. **Always link against Q6A system libcdsprpc** (`/usr/lib/libcdsprpc.so.1`), not the SDK's cross-compiled version. +3. **Do NOT set CMAKE_SYSROOT** in the cross-compile — it conflicts with Ubuntu's cross-compiler linker scripts. +4. **Use rpcmem_alloc for DSP compute buffers** — stack arrays only work for tiny buffers (~4KB fragile slow path). 
+ +## Build Command + +```bash +cd ~/llama.cpp +bash scripts/build-hexagon.sh +``` + +## Deploy Command + +```bash +Q6A=radxa@192.168.1.11 bash scripts/deploy-to-q6a.sh +``` + +## Test Command + +```bash +bash scripts/test-on-q6a.sh +``` + +## File Reference + +- `src/test_fastrpc_fixed.c` — Correct init sequence (reference for how to open HTP handles) +- `src/htp_minimal_impl.c` — Minimal DSP stub (for testing, full library works instead) +- `scripts/build-hexagon.sh` — llama.cpp cmake build for aarch64 + Hexagon +- `scripts/deploy-to-q6a.sh` — Deploy to Q6A +- `scripts/test-on-q6a.sh` — Run inference test on Q6A +- `references/fastrpc.h` — FastRPC ioctl definitions from Q6A kernel +- `README.md` — Full guide with troubleshooting + +## Performance Baseline + +- Prompt processing: ~32 t/s (on 8 CPU cores) +- Generation: ~4.5 t/s +- Model: llama-3.2-1b-q4km.gguf (1B params, Q4_K_M) diff --git a/README.md b/README.md new file mode 100644 index 0000000..94a2fa3 --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# Q6A Hexagon v68 + llama.cpp — Complete Guide + +This repo documents how to get llama.cpp running with the **Qualcomm Hexagon CDSP v68** (NPU/DSP) backend on a **Radxa Dragon Q6A** board (SA8775P). + +## Overview + +The Q6A has a Qualcomm QCS6490 SoC with a Hexagon CDSP v68 that can accelerate matrix operations in llama.cpp via FastRPC. The key insight from weeks of debugging: **let libcdsprpc handle `FASTRPC_IOCTL_INIT_CREATE` internally** — do NOT attempt it manually. Use the system's `libcdsprpc.so`, not the SDK's cross-compiled version. 
+ +## Prerequisites + +### Build Machine (x86_64) +- Ubuntu 24.04 (or similar with cross-compilation packages) +- Packages: + ```bash + sudo apt install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu ninja-build cmake + sudo apt install libc6-arm64-cross libc6-dev-arm64-cross + ``` +- Qualcomm Hexagon SDK 5.5.6.0 (with Tools 8.7.06) at `/local/mnt/workspace/Qualcomm/Hexagon_SDK/5.5.6.0/` + - Must include `hexagon-clang` at `tools/HEXAGON_Tools/8.7.06/Tools/bin/` + - Must include `qaic` IDL compiler at `tools/qaic/bin/qaic` + - Must include `incs/` with SDK headers + - Must include `ipc/fastrpc/` with libcdsprpc and rpcmem headers + +### Target Machine (Q6A — aarch64) +- Radxa Dragon Q6A (SA8775P) running Ubuntu 24.04 +- `fastrpc` package installed: `sudo apt install fastrpc fastrpc-test` +- User `radxa` in `render` group (for `/dev/fastrpc-cdsp-secure` access) +- CDSP firmware running: `cat /sys/class/remoteproc/remoteproc1/state` → `running` + +## Quick Start + +### 1. Build llama.cpp with Hexagon backend + +```bash +cd ~/llama.cpp +bash scripts/build-hexagon.sh +``` + +This cross-compiles llama.cpp for aarch64 with `-DGGML_HEXAGON=ON`. Output goes to `build-hexagon/bin/`. + +### 2. Deploy to Q6A + +```bash +# Deploy ARM64 binaries +scp build-hexagon/bin/llama-cli radxa@192.168.1.11:~/llama/bin/ +scp build-hexagon/bin/libggml*.so* radxa@192.168.1.11:~/llama/bin/ +scp build-hexagon/bin/libllama.so* radxa@192.168.1.11:~/llama/bin/ + +# Deploy DSP skel +scp build-hexagon/ggml/src/ggml-hexagon/libggml-htp-v68.so radxa@192.168.1.11:/tmp/ +ssh radxa@192.168.1.11 "sudo cp /tmp/libggml-htp-v68.so /usr/lib/dsp/cdsp/" +``` + +### 3. Run inference test + +```bash +ssh radxa@192.168.1.11 +cd ~/llama/bin +GGML_HEXAGON=1 LD_LIBRARY_PATH=. ./llama-cli \ + -m ~/models/llama-3.2-1b-q4km.gguf \ + -n 32 -p "Hello, what is your name?" 
-ngl 0 +``` + +Expected output: +``` +ggml-hex: Loading driver libcdsprpc.so +ggml-hex: Hexagon Arch version v68 +ggml-hex: new session: HTP0 : session-id 0 domain-id 3 ... +[ Prompt: 32.8 t/s | Generation: 4.5 t/s ] +``` + +## Build Script Details + +The `scripts/build-hexagon.sh` script: + +1. **CMake configure** with: + - `-DCMAKE_BUILD_TYPE=Release` + - `-DBUILD_SHARED_LIBS=ON` (required for HTP plugin .so) + - `-DCMAKE_INSTALL_RPATH='$ORIGIN'` (libraries alongside binary) + - `-DGGML_HEXAGON=ON` + - `-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_SERVER=OFF` + - `-DMAX_DOMAIN_NAMELEN=64` on both C and CXX flags + +2. **Do NOT set `CMAKE_SYSROOT`** — the cross-compiler's own linker scripts conflict with `--sysroot` on Ubuntu's `gcc-aarch64-linux-gnu` packages. + +3. **Do NOT set explicit OpenSSL paths** — they're unnecessary when `LLAMA_BUILD_SERVER=OFF`. + +## Critical Lessons Learned + +### Root Cause of `remote_handle64_open` Error 0xe + +The error occurs because **the SDK's cross-compiled `libcdsprpc.so` does NOT handle `FASTRPC_IOCTL_INIT_CREATE` internally** for unsigned PDs. The Q6A system `/usr/lib/libcdsprpc.so.1` does. The fix is to always compile and link natively on the Q6A, or, when cross-compiling, to make sure the binary loads the Q6A system library at run time rather than the SDK's copy. + +### Do NOT Call INIT_CREATE Manually + +Attempting `FASTRPC_IOCTL_INIT_CREATE` via ioctl on `/dev/fastrpc-cdsp-secure` always returns EINVAL because the kernel expects the struct to be set up by libcdsprpc's internal state machine. 
The correct approach: + +```c +/* ONLY these two calls are needed — libcdsprpc handles INIT_CREATE */ +remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, ...); +remote_handle64_open(uri, &handle); +``` + +### Verified Q6A Constants + +| Item | Value | +|------|-------| +| CDSP device node | `/dev/fastrpc-cdsp-secure` | +| Shell path | `/usr/lib/dsp/cdsp/fastrpc_shell_unsigned_3` | +| Domain ID | `CDSP_DOMAIN_ID` = 3 | +| Unsigned module flag | `FASTRPC_MODE_UNSIGNED_MODULE` = `(1 << 3)` = 0x8 | +| DSP .so path | `/usr/lib/dsp/cdsp/` | +| System libcdsprpc | `/usr/lib/libcdsprpc.so.1` (symlink at `/usr/lib/libcdsprpc.so` already exists) | +| Kernel header | `/usr/src/linux-headers-6.18.2-3-qcom/include/uapi/misc/fastrpc.h` | + +### dspqueue Symbols + +All required `dspqueue_*` symbols are present in the SA8775P system `libcdsprpc.so.1`: +`dspqueue_create`, `dspqueue_close`, `dspqueue_export`, `dspqueue_write`, `dspqueue_read`, etc. + +### Known Issues / Future Work + +- **Minimal stub library** (`htp_minimal_impl.c`) still fails to load on the DSP with error `0x80000442` (likely missing initialization that the full library does in its `main.c`). The full `libggml-htp-v68.so` (generated by the cmake build from `ggml-hexagon/main.c`) works correctly. +- **4.5 tok/s generation speed** is CPU-bound with partial DSP offload. More aggressive offloading of matrix ops to the NPU could improve this. +- **DSP library is rebuilt every time** the cmake build runs. You don't need to touch it unless you modify the Hexagon backend C code. +- The `htp_iface.idl` declares `dst` as `in sequence<uint8>` (input-only), but the DSP side writes to it, so it is actually an output. Fix upstream by declaring it `rout sequence<uint8>`; the generated C signature then drops the `const` qualifier and the stub no longer has to cast it away. 
+ +## Files in This Repo + +| File | Purpose | +|------|---------| +| `src/test_fastrpc_fixed.c` | Corrected test harness with proper init sequence | +| `src/htp_minimal_impl.c` | Minimal DSP stub (for experimentation) | +| `scripts/build-hexagon.sh` | Cross-compile script for llama.cpp with GGML_HEXAGON=ON | +| `scripts/deploy-to-q6a.sh` | Deploy built binaries + DSP .so to Q6A | +| `scripts/test-on-q6a.sh` | Run full inference test on Q6A | +| `references/fastrpc.h` | Q6A kernel header (ioctl struct definitions) | +| `AGENTS.md` | Context for AI coding agents working with this codebase | diff --git a/references/fastrpc.h b/references/fastrpc.h new file mode 100644 index 0000000..c6e2925 --- /dev/null +++ b/references/fastrpc.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef __QCOM_FASTRPC_H__ +#define __QCOM_FASTRPC_H__ + +#include + +#define FASTRPC_IOCTL_ALLOC_DMA_BUFF _IOWR('R', 1, struct fastrpc_alloc_dma_buf) +#define FASTRPC_IOCTL_FREE_DMA_BUFF _IOWR('R', 2, __u32) +#define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) +#define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) +#define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) +#define FASTRPC_IOCTL_MMAP _IOWR('R', 6, struct fastrpc_req_mmap) +#define FASTRPC_IOCTL_MUNMAP _IOWR('R', 7, struct fastrpc_req_munmap) +#define FASTRPC_IOCTL_INIT_ATTACH_SNS _IO('R', 8) +#define FASTRPC_IOCTL_INIT_CREATE_STATIC _IOWR('R', 9, struct fastrpc_init_create_static) +#define FASTRPC_IOCTL_MEM_MAP _IOWR('R', 10, struct fastrpc_mem_map) +#define FASTRPC_IOCTL_MEM_UNMAP _IOWR('R', 11, struct fastrpc_mem_unmap) +#define FASTRPC_IOCTL_GET_DSP_INFO _IOWR('R', 13, struct fastrpc_ioctl_capability) + +/** + * enum fastrpc_map_flags - control flags for mapping memory on DSP user process + * @FASTRPC_MAP_STATIC: Map memory pages with RW- permission and CACHE WRITEBACK. 
+ * The driver is responsible for cache maintenance when passed + * the buffer to FastRPC calls. Same virtual address will be + * assigned for subsequent FastRPC calls. + * @FASTRPC_MAP_RESERVED: Reserved + * @FASTRPC_MAP_FD: Map memory pages with RW- permission and CACHE WRITEBACK. + * Mapping tagged with a file descriptor. User is responsible for + * CPU and DSP cache maintenance for the buffer. Get virtual address + * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() APIs. + * @FASTRPC_MAP_FD_DELAYED: Mapping delayed until user call HAP_mmap() and HAP_munmap() + * functions on DSP. It is useful to map a buffer with cache modes + * other than default modes. User is responsible for CPU and DSP + * cache maintenance for the buffer. + * @FASTRPC_MAP_FD_NOMAP: This flag is used to skip CPU mapping, + * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag. + * @FASTRPC_MAP_MAX: max count for flags + * + */ +enum fastrpc_map_flags { + FASTRPC_MAP_STATIC = 0, + FASTRPC_MAP_RESERVED, + FASTRPC_MAP_FD = 2, + FASTRPC_MAP_FD_DELAYED, + FASTRPC_MAP_FD_NOMAP = 16, + FASTRPC_MAP_MAX, +}; + +enum fastrpc_proc_attr { + /* Macro for Debug attr */ + FASTRPC_MODE_DEBUG = (1 << 0), + /* Macro for Ptrace */ + FASTRPC_MODE_PTRACE = (1 << 1), + /* Macro for CRC Check */ + FASTRPC_MODE_CRC = (1 << 2), + /* Macro for Unsigned PD */ + FASTRPC_MODE_UNSIGNED_MODULE = (1 << 3), + /* Macro for Adaptive QoS */ + FASTRPC_MODE_ADAPTIVE_QOS = (1 << 4), + /* Macro for System Process */ + FASTRPC_MODE_SYSTEM_PROCESS = (1 << 5), + /* Macro for Prvileged Process */ + FASTRPC_MODE_PRIVILEGED = (1 << 6), +}; + +/* Fastrpc attribute for memory protection of buffers */ +#define FASTRPC_ATTR_SECUREMAP (1) + +struct fastrpc_invoke_args { + __u64 ptr; + __u64 length; + __s32 fd; + __u32 attr; +}; + +struct fastrpc_invoke { + __u32 handle; + __u32 sc; + __u64 args; +}; + +struct fastrpc_init_create { + __u32 filelen; /* elf file length */ + __s32 filefd; /* fd for the file */ + __u32 attrs; + 
# ======================================================================
# File: scripts/build-hexagon.sh
# ======================================================================
#!/usr/bin/env bash
# build-hexagon.sh — Cross-compile llama.cpp for aarch64 with GGML_HEXAGON=ON.
#
# The script is expected to live in <llama.cpp>/scripts/, so the source tree
# is the PARENT of the script directory. Every path can be overridden from
# the environment (HEXAGON_SDK_ROOT, HEXAGON_TOOLS_ROOT, LLAMA_SRC, BUILD_DIR).
set -euo pipefail

HEXAGON_SDK_ROOT="${HEXAGON_SDK_ROOT:-/local/mnt/workspace/Qualcomm/Hexagon_SDK/5.5.6.0}"
HEXAGON_TOOLS_ROOT="${HEXAGON_TOOLS_ROOT:-${HEXAGON_SDK_ROOT}/tools/HEXAGON_Tools/8.7.06}"
# dirname "$0" is the scripts/ directory; the CMake project root is one level up.
LLAMA_SRC="${LLAMA_SRC:-$(cd "$(dirname "$0")/.." && pwd)}"
BUILD_DIR="${BUILD_DIR:-${LLAMA_SRC}/build-hexagon}"

# Fail early with a readable message instead of a cmake "no CMakeLists.txt" error.
if [ ! -f "${LLAMA_SRC}/CMakeLists.txt" ]; then
    echo "ERROR: ${LLAMA_SRC} is not a CMake source tree (set LLAMA_SRC)" >&2
    exit 1
fi

# CMAKE_SYSROOT is deliberately NOT set: it conflicts with the linker scripts
# shipped by Ubuntu's gcc-aarch64-linux-gnu packages.
cmake -B "${BUILD_DIR}" -G Ninja \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_SYSTEM_NAME=Linux \
    -DBUILD_SHARED_LIBS=ON \
    -DCMAKE_INSTALL_RPATH='$ORIGIN' \
    -DCMAKE_BUILD_RPATH='$ORIGIN' \
    -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
    -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
    -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
    -DGGML_HEXAGON=ON \
    -DHEXAGON_SDK_ROOT="${HEXAGON_SDK_ROOT}" \
    -DHEXAGON_TOOLS_ROOT="${HEXAGON_TOOLS_ROOT}" \
    "-DCMAKE_C_FLAGS=-DMAX_DOMAIN_NAMELEN=64" \
    "-DCMAKE_CXX_FLAGS=-DMAX_DOMAIN_NAMELEN=64" \
    "${LLAMA_SRC}"

cmake --build "${BUILD_DIR}" -- -j"$(nproc)"

# ======================================================================
# File: scripts/deploy-to-q6a.sh
# ======================================================================
#!/usr/bin/env bash
# deploy-to-q6a.sh — Deploy llama.cpp ARM binaries + DSP .so to Q6A
#
# Environment overrides:
#   Q6A            ssh target (user@host)
#   BUILD_DIR      cmake build directory produced by build-hexagon.sh
#   DEPLOY_DIR     destination directory, relative to the remote $HOME
#   Q6A_SUDO_PASS  password fed to `sudo -S` on the board (default: radxa)
set -euo pipefail

Q6A="${Q6A:-radxa@192.168.1.11}"
BUILD_DIR="${BUILD_DIR:-$HOME/llama.cpp/build-hexagon}"
DEPLOY_DIR="${DEPLOY_DIR:-llama/bin}"
# NOTE(review): a default password in a script is a convenience for a lab
# board only; prefer a NOPASSWD sudoers rule or export Q6A_SUDO_PASS.
Q6A_SUDO_PASS="${Q6A_SUDO_PASS:-radxa}"

echo "=== Deploying to ${Q6A}:${DEPLOY_DIR} ==="

# Collect what actually exists instead of hard-coding library version suffixes.
shopt -s nullglob
artifacts=()

if [ ! -f "${BUILD_DIR}/bin/llama-cli" ]; then
    echo "ERROR: ${BUILD_DIR}/bin/llama-cli not found — run build-hexagon.sh first" >&2
    exit 1
fi
artifacts+=("${BUILD_DIR}/bin/llama-cli")

for lib in libggml-hexagon libggml-base libggml-cpu libggml libllama; do
    found=0
    # Unversioned name plus every versioned sibling (.so.0, .so.0.x.y, ...).
    for f in "${BUILD_DIR}/bin/${lib}.so" "${BUILD_DIR}/bin/${lib}.so".*; do
        [ -e "$f" ] || continue
        artifacts+=("$f")
        found=1
    done
    if [ "$found" -eq 0 ]; then
        echo "WARNING: ${lib}.so not found — build may be incomplete"
    fi
done

# Create deploy dir
ssh "${Q6A}" "mkdir -p ~/${DEPLOY_DIR}"

# Deploy ARM binaries in one transfer; a failed copy now aborts the script
# instead of being hidden behind `2>/dev/null || true`.
scp "${artifacts[@]}" "${Q6A}:~/${DEPLOY_DIR}/"

# Deploy DSP skel (directory name is ggml-hexagon, matching the cmake tree).
DSP_SO="${BUILD_DIR}/ggml/src/ggml-hexagon/libggml-htp-v68.so"
if [ -f "$DSP_SO" ]; then
    scp "$DSP_SO" "${Q6A}:/tmp/"
    # The password travels over ssh's stdin, not in the remote command line.
    printf '%s\n' "${Q6A_SUDO_PASS}" | ssh "${Q6A}" \
        "sudo -S -p '' cp /tmp/libggml-htp-v68.so /usr/lib/dsp/cdsp/libggml-htp-v68.so"
    echo "DSP .so deployed"
else
    echo "WARNING: DSP .so not found at $DSP_SO"
fi

echo "=== Deploy complete ==="
ssh "${Q6A}" "ls -la ~/${DEPLOY_DIR}/"

# ======================================================================
# File: scripts/test-on-q6a.sh
# ======================================================================
#!/usr/bin/env bash
# test-on-q6a.sh — Run llama-cli inference test on Q6A with Hexagon backend
#
# Exits with llama-cli's exit status, so a failed inference run fails the script.
# Environment overrides: Q6A, MODEL, DEPLOY_DIR, N_TOKENS, PROMPT.
set -euo pipefail

Q6A="${Q6A:-radxa@192.168.1.11}"
MODEL="${MODEL:-/home/radxa/models/llama-3.2-1b-q4km.gguf}"
DEPLOY_DIR="${DEPLOY_DIR:-llama/bin}"
N_TOKENS="${N_TOKENS:-32}"
PROMPT="${PROMPT:-Hello, what is your name?}"

echo "=== Running inference test on Q6A ==="
echo "Model: ${MODEL}"
echo "Tokens: ${N_TOKENS}"
echo ""

# Shell-quote every user-supplied value once, locally, so that quotes or
# spaces inside PROMPT/MODEL cannot break (or inject into) the remote command.
model_q=$(printf '%q' "${MODEL}")
tokens_q=$(printf '%q' "${N_TOKENS}")
prompt_q=$(printf '%q' "${PROMPT}")
deploy_q=$(printf '%q' "${DEPLOY_DIR}")

# Everything written as \$ below is expanded on the Q6A, not on this host.
ssh "${Q6A}" "
    cd ~/${deploy_q} || exit 1
    echo '--- llama-cli version ---'
    ./llama-cli --version 2>&1 || true
    echo ''
    echo '--- Running inference with GGML_HEXAGON=1 ---'
    rc=0
    GGML_HEXAGON=1 LD_LIBRARY_PATH=. ./llama-cli \
        -m ${model_q} \
        -n ${tokens_q} \
        -p ${prompt_q} \
        -ngl 0 \
        --no-display-prompt \
        2>&1 || rc=\$?
    echo ''
    echo \"--- exit: \${rc} ---\"
    exit \"\${rc}\"
" 2>&1
protection. + // This works in practice; fix the IDL upstream if submitting. + (void)_h; (void)op_data; (void)op_dataLen; + (void)src1; (void)src1Len; + if (dstLen >= src0Len && src0Len > 0) { + memcpy((void*)dst, src0, src0Len); + } + return AEE_SUCCESS; +} diff --git a/src/test_fastrpc_fixed.c b/src/test_fastrpc_fixed.c new file mode 100644 index 0000000..d3bc013 --- /dev/null +++ b/src/test_fastrpc_fixed.c @@ -0,0 +1,110 @@ +/* + * test_fastrpc_fixed.c + * Correct FastRPC usage: let libcdsprpc handle INIT_CREATE internally. + * + * Key findings (after debugging on Q6A SA8775P): + * 1. Do NOT call FASTRPC_IOCTL_INIT_CREATE manually — libcdsprpc handles it + * 2. Must compile natively on Q6A (or link against Q6A system libcdsprpc) + * 3. remote_session_control(UNSIGNED_MODULE) is required before handle open + * 4. Compute buffers must use rpcmem_alloc (not stack/malloc) for DMA access + * + * Compile on Q6A: + * gcc -O2 -o test_fastrpc_fixed test_fastrpc_fixed.c \ + * htp_iface_stub.c -lcdsprpc -lpthread + * + * Run: + * ./test_fastrpc_fixed [shell_path] [uri] + */ +#include +#include +#include +#include + +#include +#include +#include + +/* Externs from htp_iface_stub.c (generated by QAIC from htp_iface.idl) */ +extern int htp_iface_open(const char *uri, remote_handle64 *handle); +extern int htp_iface_close(remote_handle64 handle); +extern int htp_iface_compute(remote_handle64 handle, + const uint8 *op_data, int op_data_len, + const uint8 *src0, int src0_len, + const uint8 *src1, int src1_len, + const uint8 *dst, int dst_len); + +int main(int argc, char **argv) +{ + const char *uri = argc > 1 ? 
argv[1] + : "file:///libggml-htp-v68.so" + "?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0"; + + printf("URI: %s\n", uri); + + /* Step 1: Enable unsigned PD on CDSP */ + { + struct remote_rpc_control_unsigned_module u = { + .domain = CDSP_DOMAIN_ID, + .enable = 1, + }; + int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, + &u, sizeof(u)); + printf("remote_session_control(UNSIGNED): 0x%x\n", err); + if (err != AEE_SUCCESS) { + fprintf(stderr, "FAILED: unsigned PD not supported\n"); + return 1; + } + } + + /* Step 2: Open the HTP handle — libcdsprpc handles INIT_CREATE */ + remote_handle64 handle = 0; + int err = htp_iface_open(uri, &handle); + printf("htp_iface_open: 0x%x handle=0x%llx\n", + err, (unsigned long long)handle); + if (err != AEE_SUCCESS || !handle) { + fprintf(stderr, "Handle open FAILED: 0x%x\n", err); + return 1; + } + + /* Step 3: Compute — use rpcmem for DSP-accessible buffers */ + int buf_size = 1024; + uint8_t *op = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, + RPCMEM_DEFAULT_FLAGS, buf_size); + uint8_t *src = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, + RPCMEM_DEFAULT_FLAGS, buf_size); + uint8_t *dst = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, + RPCMEM_DEFAULT_FLAGS, buf_size); + + if (!op || !src || !dst) { + fprintf(stderr, "rpcmem_alloc failed\n"); + htp_iface_close(handle); + return 1; + } + + memset(src, 0x42, buf_size); + memset(dst, 0, buf_size); + memset(op, 0, buf_size); + + printf("Calling htp_iface_compute (%d bytes)...\n", buf_size); + err = htp_iface_compute(handle, + op, buf_size, + src, buf_size, + src, buf_size, + dst, buf_size); + printf("htp_iface_compute: 0x%x\n", err); + + if (err == AEE_SUCCESS) { + int non_zero = 0; + for (int i = 0; i < buf_size; i++) + if (dst[i] != 0) { non_zero = 1; break; } + printf("DSP wrote output: %s\n", non_zero ? "YES" : "NO"); + } + + rpcmem_free(op); + rpcmem_free(src); + rpcmem_free(dst); + + htp_iface_close(handle); + printf("Done.\n"); + return err != AEE_SUCCESS ? 1 : 0; +}