From 07b7709362767ae80b56dabe15234dccdbf46897 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Fri, 20 Feb 2026 12:51:44 +0100 Subject: [PATCH 1/4] debug: ztest: reduce delay between tests Currently ztest delay is set to 100ms, which adds that time between each two ztest runs. This very quickly adds up to cause an IPC timeout in the kernel driver. That delay isn't needed for SOF, set it to 1ms. Signed-off-by: Guennadi Liakhovetski --- app/debug_overlay.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/app/debug_overlay.conf b/app/debug_overlay.conf index 914210d4184d..976430b90ed4 100644 --- a/app/debug_overlay.conf +++ b/app/debug_overlay.conf @@ -3,6 +3,7 @@ CONFIG_ASSERT=y CONFIG_ZTEST_NO_YIELD=n CONFIG_ZTEST_SUMMARY=n +CONFIG_ZTEST_TEST_DELAY_MS=1 CONFIG_SOF_BOOT_TEST_ALLOWED=y CONFIG_TEST_EXTRA_STACK_SIZE=7168 From 9c087bda00d191bb26431b86156dcdb2bd383200 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Fri, 20 Feb 2026 12:54:25 +0100 Subject: [PATCH 2/4] zephyr: temporarily use a pool of stacks When dynamically allocating a thread stack with the current Zephyr implementation it's allocated uncached. So far there is no proper solution to support fully dynamic cached stacks. Switch to the system stack pool until a solution is implemented. This only affects userspace threads currently.
Signed-off-by: Guennadi Liakhovetski --- app/overlays/ptl/userspace_overlay.conf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/overlays/ptl/userspace_overlay.conf b/app/overlays/ptl/userspace_overlay.conf index 331e2b4280d6..555e27ba7189 100644 --- a/app/overlays/ptl/userspace_overlay.conf +++ b/app/overlays/ptl/userspace_overlay.conf @@ -4,7 +4,8 @@ CONFIG_MAX_THREAD_BYTES=4 CONFIG_INIT_STACKS=n CONFIG_THREAD_STACK_INFO=n -CONFIG_DYNAMIC_THREAD_PREFER_ALLOC=y +CONFIG_DYNAMIC_THREAD_PREFER_POOL=y +CONFIG_DYNAMIC_THREAD_PREFER_ALLOC=n CONFIG_DYNAMIC_THREAD=y CONFIG_DYNAMIC_THREAD_POOL_SIZE=4 CONFIG_DYNAMIC_THREAD_ALLOC=n From a92d0d28080b029d4ab3cc62e919e3d8095279aa Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Fri, 20 Feb 2026 13:01:10 +0100 Subject: [PATCH 3/4] zephyr: bump CONFIG_MAX_THREAD_BYTES to 4 3 bytes per object aren't enough any more. Increase to 4 which still fits in an atomic 32-bit field. Signed-off-by: Guennadi Liakhovetski --- app/boards/intel_adsp_ace30_ptl.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/boards/intel_adsp_ace30_ptl.conf b/app/boards/intel_adsp_ace30_ptl.conf index 12aa78162c06..8f88f4ea82f1 100644 --- a/app/boards/intel_adsp_ace30_ptl.conf +++ b/app/boards/intel_adsp_ace30_ptl.conf @@ -74,4 +74,4 @@ CONFIG_DYNAMIC_THREAD_ALLOC=y CONFIG_DYNAMIC_THREAD_PREFER_ALLOC=y CONFIG_SOF_STACK_SIZE=8192 CONFIG_SOF_USERSPACE_PROXY=y -CONFIG_MAX_THREAD_BYTES=3 +CONFIG_MAX_THREAD_BYTES=4 From a310f28498c452486d30cf85da2c7191d0ce5da8 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Fri, 20 Feb 2026 13:03:17 +0100 Subject: [PATCH 4/4] boot-test: add a userspace performance test Add a test to compare kernel and userspace performance when performing tight loop calculations with no API calls. 
Signed-off-by: Guennadi Liakhovetski
---
 zephyr/test/CMakeLists.txt        |   1 +
 zephyr/test/userspace/test_perf.c | 228 ++++++++++++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 zephyr/test/userspace/test_perf.c

diff --git a/zephyr/test/CMakeLists.txt b/zephyr/test/CMakeLists.txt
index c5b66c83bbaa..5e4c06da864e 100644
--- a/zephyr/test/CMakeLists.txt
+++ b/zephyr/test/CMakeLists.txt
@@ -4,6 +4,7 @@ if(CONFIG_SOF_BOOT_TEST)
 	)
 	zephyr_library_sources_ifdef(CONFIG_USERSPACE
 		userspace/ksem.c
+		userspace/test_perf.c
 	)
 endif()
 
diff --git a/zephyr/test/userspace/test_perf.c b/zephyr/test/userspace/test_perf.c
new file mode 100644
index 000000000000..7b174d7ba125
--- /dev/null
+++ b/zephyr/test/userspace/test_perf.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright(c) 2026 Intel Corporation. */
+
+/* Test kernel vs. user-space performance. */
+
+#include
+#include
+
+#include
+#include
+#include
+
+LOG_MODULE_DECLARE(sof_boot_test, LOG_LEVEL_DBG);
+
+/*
+ * Integer load: N_ADD additions followed by N_DIV divide/multiply steps on
+ * an unsigned accumulator, the result is returned converted to int.
+ *
+ * NOTE(review): nothing here stops an optimising compiler from folding the
+ * first loop - confirm that the generated code really iterates N_ADD times.
+ */
+static int load_add(void)
+{
+#define N_ADD (1000 * 1000 * 100)
+	unsigned long r = 0;
+
+	for (unsigned int i = 0; i < N_ADD; i++)
+		r += i;
+#define N_DIV 10000
+	for (unsigned int i = 1; i <= N_DIV; i++)
+		r = r / (i % 10 + 1) * (i % 10 + 3);
+	return (int)r;
+}
+
+#ifdef __XCC__
+#include
+
+/*
+ * Compute dot product of two vectors using HiFi4 SIMD instructions.
+ *
+ * Elements are consumed four at a time, a tail of length % 4 elements is
+ * ignored.
+ */
+static int32_t dot_product_hifi4(const int16_t *a, const int16_t *b, int length)
+{
+	ae_int64 acc = AE_ZERO64(); /* 1. Initialize accumulator to zero */
+	ae_int16x4 *pa = (ae_int16x4 *)a; /* Pointer to vector a */
+	ae_int16x4 *pb = (ae_int16x4 *)b; /* Pointer to vector b */
+
+	for (int i = 0; i < length / 4; i++) {
+		ae_int16x4 va, vb;
+
+		AE_L16X4_IP(va, pa, 8); /* 2. Load 4x 16-bit values from a */
+		AE_L16X4_IP(vb, pb, 8); /* 3. Load 4x 16-bit values from b */
+		AE_MULAAAAQ16(acc, va, vb); /* 4. Multiply-accumulate (4 MACs in parallel) */
+	}
+
+	return AE_TRUNCA32F64S(acc, 0); /* 5. Convert 64-bit result to 32-bit */
+}
+
+#define VECTOR_LENGTH 100
+/*
+ * HiFi4 load: 1000 rounds of filling two vectors from the loop counters and
+ * accumulating their dot product.
+ */
+static int load_hifi4(void)
+{
+	/*
+	 * Element type matches the dot_product_hifi4() prototype. The vectors
+	 * are loaded 64 bits at a time, so request 8-byte alignment explicitly
+	 * instead of depending on the stack layout.
+	 */
+	int16_t a[VECTOR_LENGTH] __aligned(8);
+	int16_t b[VECTOR_LENGTH] __aligned(8);
+	int ret = 0;
+
+	for (unsigned int j = 0; j < 1000; j++) {
+		for (unsigned int i = 0; i < VECTOR_LENGTH; i++) {
+			/* Only the low 16 bits are kept */
+			a[i] = (int16_t)(i * 3 - 47 * j);
+			b[i] = (int16_t)(411 * j - i * 5);
+		}
+
+		ret += dot_product_hifi4(a, b, VECTOR_LENGTH);
+	}
+	return ret;
+}
+#endif /* __XCC__ */
+
+typedef int (*load_fn_t)(void);
+
+/* Loads to run in both the user-space and the kernel thread */
+static const load_fn_t load_fn[] = {
+	load_add,
+#ifdef __XCC__
+	load_hifi4,
+#endif
+};
+
+/*
+ * Hand fn to the worker thread waiting on event, wait up to 200ms for it to
+ * give sem and return the elapsed time in ticks.
+ *
+ * The function pointer travels as the 32-bit event value, so this relies on
+ * code addresses fitting in 32 bits.
+ */
+static unsigned int test_perf(load_fn_t fn, struct k_event *event,
+			      struct k_sem *sem)
+{
+	uint64_t start = k_uptime_ticks();
+
+	k_event_set(event, (uint32_t)fn);
+
+	int ret = k_sem_take(sem, K_MSEC(200));
+
+	zassert_ok(ret);
+
+	uint64_t end = k_uptime_ticks();
+
+	return (unsigned int)(end - start);
+}
+
+/*
+ * Worker thread entry point: p1 is the struct k_event to wait on, p2 the
+ * struct k_sem to give after each run, p3 is unused. Never returns, the test
+ * aborts the thread when done.
+ */
+static void thread_fn(void *p1, void *p2, void *p3)
+{
+	struct k_event *event = p1;
+	struct k_sem *sem = p2;
+	bool first = true;
+
+	for (;;) {
+		/*
+		 * Clear stale events on every wait but the first.
+		 * NOTE(review): a request posted before this thread gets back
+		 * here would be cleared too - confirm scheduling rules it out.
+		 */
+		load_fn_t fn = (load_fn_t)k_event_wait(event, 0xffffffff, !first, K_FOREVER);
+		/* Not inside LOG_INF(): log arguments may be left unevaluated */
+		int ret = fn();
+
+		first = false;
+		LOG_INF("fn %p ret %d", (void *)fn, ret);
+
+		k_sem_give(sem);
+	}
+}
+
+#define STACK_SIZE 4096
+
+/*
+ * Run every load once in a user-space thread and once in a kernel thread and
+ * log how many ticks each run took.
+ */
+ZTEST(sof_boot, test_perf)
+{
+	/* Synchronization objects allocated on original uncached heap */
+	struct k_event *u_event = k_object_alloc(K_OBJ_EVENT);
+	struct k_event *k_event = k_object_alloc(K_OBJ_EVENT);
+
+	zassert_not_null(u_event);
+	zassert_not_null(k_event);
+
+	k_event_init(u_event);
+	k_event_init(k_event);
+
+	struct k_sem *sem = k_object_alloc(K_OBJ_SEM);
+
+	zassert_not_null(sem);
+	k_sem_init(sem, 0, 1);
+
+	/* Allocate kernel stack and thread and start it */
+	struct k_thread *k_thread = k_object_alloc(K_OBJ_THREAD);
+
+	zassert_not_null(k_thread);
+	/* Important: Xtensa thread initialization code checks certain fields for 0 */
+	memset(&k_thread->arch, 0, sizeof(k_thread->arch));
+
+	k_thread_stack_t *k_stack = k_thread_stack_alloc(STACK_SIZE, 0);
+
+	zassert_not_null(k_stack);
+
+	struct k_thread *pk_thread = k_thread_create(k_thread, k_stack, STACK_SIZE, thread_fn,
+						     k_event, sem, NULL, 0, 0, K_FOREVER);
+
+	zassert_not_null(pk_thread);
+	k_thread_start(pk_thread);
+
+	/* Allocate userspace stack and thread and start it */
+	struct k_thread *u_thread = k_object_alloc(K_OBJ_THREAD);
+
+	zassert_not_null(u_thread);
+	memset(&u_thread->arch, 0, sizeof(u_thread->arch));
+
+	k_thread_stack_t *u_stack = k_thread_stack_alloc(STACK_SIZE, K_USER);
+
+	zassert_not_null(u_stack);
+
+	struct k_thread *pu_thread = k_thread_create(u_thread, u_stack, STACK_SIZE, thread_fn,
+						     u_event, sem, NULL, 0, K_USER, K_FOREVER);
+
+	zassert_not_null(pu_thread);
+	k_thread_access_grant(pu_thread, u_event, sem);
+	k_thread_start(pu_thread);
+
+	for (unsigned int i = 0; i < ARRAY_SIZE(load_fn); i++) {
+		/* Not inside LOG_INF(): log arguments may be left unevaluated */
+		unsigned int ticks = test_perf(load_fn[i], u_event, sem);
+
+		LOG_INF("user: fn %p took %u", (void *)load_fn[i], ticks);
+
+		ticks = test_perf(load_fn[i], k_event, sem);
+		LOG_INF("kernel: fn %p took %u", (void *)load_fn[i], ticks);
+	}
+
+	/* Thread objects come from k_object_alloc() as well, free them too */
+	k_thread_abort(pu_thread);
+	k_thread_stack_free(u_stack);
+	k_object_free(u_thread);
+	k_thread_abort(pk_thread);
+	k_thread_stack_free(k_stack);
+	k_object_free(k_thread);
+	k_object_free(sem);
+	k_object_free(u_event);
+	k_object_free(k_event);
+}