From d36d0806c9d9048fe8c1d1c9574bd668efa43618 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 14:45:15 +0000 Subject: [PATCH 01/31] fix: put preempted Syscall(Executing) coroutines back in ready queue instead of syscall map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SIGURG preempts a coroutine in Syscall(Executing) state (due to a race between the monitor sending SIGURG and MonitorListener removing the notify node), the scheduler was putting it in the syscall map without a syscall_suspend entry. This caused the coroutine to get stuck forever — no mechanism moves Syscall(Executing) coroutines from the syscall map back to the ready queue. Fix: detect Syscall(Executing) yields as preemption events and push the coroutine back to the ready queue. resume_with() already accepts Syscall(Executing) → Running transitions, so the coroutine resumes correctly from inside the signal handler. Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/2b3f1622-04e3-4f08-945a-e1e391d32eb0 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/scheduler.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index dff9980d..c87d25d2 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -311,12 +311,17 @@ impl<'s> Scheduler<'s> { _ = RUNNING_COROUTINES.remove(&co_id); })? 
{ CoroutineState::Syscall((), _, state) => { - //挂起协程到系统调用表 - //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 - _ = self.syscall.insert(co_id, coroutine); - if let SyscallState::Suspend(timestamp) = state { - self.syscall_suspend - .push(SyscallSuspendItem { timestamp, co_id }); + if let SyscallState::Executing = state { + //协程在系统调用执行期间被信号抢占,放回就绪队列以恢复执行 + self.ready.push(coroutine); + } else { + //挂起协程到系统调用表 + //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 + _ = self.syscall.insert(co_id, coroutine); + if let SyscallState::Suspend(timestamp) = state { + self.syscall_suspend + .push(SyscallSuspendItem { timestamp, co_id }); + } } } CoroutineState::Suspend((), timestamp) => { From 5581cff683b0893b537ac9d194cb27a87d36e998 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 15:17:15 +0000 Subject: [PATCH 02/31] fix: handle Syscall(Executing) in try_resume and check_ready instead of do_schedule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert the do_schedule change per reviewer feedback. The correct fix follows the callback chain: syscall→event_loop→co_pool→scheduler via Scheduler::try_resume. 1. try_resume: handle Syscall(Executing) state so the event loop callback doesn't hit unreachable!() when a coroutine was preempted by SIGURG during syscall execution. 2. check_ready: scan the syscall map for Syscall(Executing) coroutines and move them back to the ready queue. This handles non-socket I/O (e.g., file_co) where no epoll event fires to trigger try_resume. 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/2431fc42-a1ac-4caa-b4d6-ccbd8e90c86b Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/scheduler.rs | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index c87d25d2..675f1984 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -227,6 +227,8 @@ impl<'s> Scheduler<'s> { co.syscall(val, syscall, SyscallState::Callback) .expect("change syscall state failed"); } + //协程在系统调用执行期间被信号抢占,直接放回就绪队列 + CoroutineState::Syscall(_, _, SyscallState::Executing) => {} _ => unreachable!("try_resume unexpect CoroutineState"), } self.ready.push(co); @@ -311,17 +313,12 @@ impl<'s> Scheduler<'s> { _ = RUNNING_COROUTINES.remove(&co_id); })? { CoroutineState::Syscall((), _, state) => { - if let SyscallState::Executing = state { - //协程在系统调用执行期间被信号抢占,放回就绪队列以恢复执行 - self.ready.push(coroutine); - } else { - //挂起协程到系统调用表 - //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 - _ = self.syscall.insert(co_id, coroutine); - if let SyscallState::Suspend(timestamp) = state { - self.syscall_suspend - .push(SyscallSuspendItem { timestamp, co_id }); - } + //挂起协程到系统调用表 + //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 + _ = self.syscall.insert(co_id, coroutine); + if let SyscallState::Suspend(timestamp) = state { + self.syscall_suspend + .push(SyscallSuspendItem { timestamp, co_id }); } } CoroutineState::Suspend((), timestamp) => { @@ -389,6 +386,23 @@ impl<'s> Scheduler<'s> { } } } + // Check for coroutines preempted during syscall execution (SIGURG race). + // These have SyscallState::Executing and no syscall_suspend entry, + // so they would be stuck in the syscall map forever without this. 
+ let executing: Vec = self + .syscall + .iter() + .filter(|entry| { + matches!( + entry.value().state(), + CoroutineState::Syscall(_, _, SyscallState::Executing) + ) + }) + .map(|entry| *entry.key()) + .collect(); + for co_id in executing { + self.try_resume(co_id); + } Ok(()) } From 9ecd4ea6a43560963919c252990e01b58175b542 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 16:09:11 +0000 Subject: [PATCH 03/31] fix: reorder change_state to log before notifying MonitorListener Swap the order of info!() and on_state_changed() in change_state() so that MonitorListener submits the NOTIFY_NODE *after* the potentially slow logging call completes. On QEMU/loongarch64 the info!() call can take >10ms, exhausting the preemption interval before the coroutine can even resume, causing a live-lock where the monitor retries SIGURG every 1ms and the coroutine never makes progress. Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/cac264ee-7730-4b26-9bd4-dacab5bc93e5 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index 1ba91410..1b780e1b 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,12 +18,15 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - self.on_state_changed(self, old_state, new_state); + //先打印日志再通知监听器,确保MonitorListener提交的NOTIFY_NODE时间戳 + //在日志输出之后,避免在QEMU等慢速平台上因日志输出耗时超过抢占间隔 + //导致协程被反复抢占无法推进的活锁问题 if let CoroutineState::Error(_) = new_state { error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { info!("{} {:?}->{:?}", self.name(), old_state, new_state); } + self.on_state_changed(self, old_state, new_state); old_state } From a0a79f07b4ed2c6c829e282e9aca6f29b1b7eb1e Mon Sep 17 00:00:00 2001 From: 
"copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 16:13:13 +0000 Subject: [PATCH 04/31] fix: translate change_state comment to bilingual Chinese/English Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/cac264ee-7730-4b26-9bd4-dacab5bc93e5 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index 1b780e1b..ab30488e 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,9 +18,10 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - //先打印日志再通知监听器,确保MonitorListener提交的NOTIFY_NODE时间戳 - //在日志输出之后,避免在QEMU等慢速平台上因日志输出耗时超过抢占间隔 - //导致协程被反复抢占无法推进的活锁问题 + //先打印日志再通知监听器,避免在QEMU等慢速平台上的活锁问题 + // Log before notifying listeners so that MonitorListener submits + // the NOTIFY_NODE after the (potentially slow) log I/O completes, + // preventing a preemption live-lock on slow platforms like QEMU. 
if let CoroutineState::Error(_) = new_state { error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { From 87d47a6e38f6154bdd20c4cde53cdaa03dc8f659 Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 00:20:59 +0800 Subject: [PATCH 05/31] fix compile --- core/src/scheduler.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index 675f1984..d077c003 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -228,7 +228,7 @@ impl<'s> Scheduler<'s> { .expect("change syscall state failed"); } //协程在系统调用执行期间被信号抢占,直接放回就绪队列 - CoroutineState::Syscall(_, _, SyscallState::Executing) => {} + CoroutineState::Syscall((), _, SyscallState::Executing) => {} _ => unreachable!("try_resume unexpect CoroutineState"), } self.ready.push(co); @@ -313,12 +313,17 @@ impl<'s> Scheduler<'s> { _ = RUNNING_COROUTINES.remove(&co_id); })? { CoroutineState::Syscall((), _, state) => { - //挂起协程到系统调用表 - //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 - _ = self.syscall.insert(co_id, coroutine); - if let SyscallState::Suspend(timestamp) = state { - self.syscall_suspend - .push(SyscallSuspendItem { timestamp, co_id }); + if let SyscallState::Executing = state { + //协程在系统调用执行期间被信号抢占,放回就绪队列以恢复执行 + self.ready.push(coroutine); + } else { + //挂起协程到系统调用表 + //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 + _ = self.syscall.insert(co_id, coroutine); + if let SyscallState::Suspend(timestamp) = state { + self.syscall_suspend + .push(SyscallSuspendItem { timestamp, co_id }); + } } } CoroutineState::Suspend((), timestamp) => { @@ -395,7 +400,7 @@ impl<'s> Scheduler<'s> { .filter(|entry| { matches!( entry.value().state(), - CoroutineState::Syscall(_, _, SyscallState::Executing) + CoroutineState::Syscall((), _, SyscallState::Executing) ) }) .map(|entry| *entry.key()) From fac116841337a0db67d669e38e3e3d61688e2560 Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 
00:37:14 +0800 Subject: [PATCH 06/31] clean code --- core/src/scheduler.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index d077c003..c87d25d2 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -227,8 +227,6 @@ impl<'s> Scheduler<'s> { co.syscall(val, syscall, SyscallState::Callback) .expect("change syscall state failed"); } - //协程在系统调用执行期间被信号抢占,直接放回就绪队列 - CoroutineState::Syscall((), _, SyscallState::Executing) => {} _ => unreachable!("try_resume unexpect CoroutineState"), } self.ready.push(co); @@ -391,23 +389,6 @@ impl<'s> Scheduler<'s> { } } } - // Check for coroutines preempted during syscall execution (SIGURG race). - // These have SyscallState::Executing and no syscall_suspend entry, - // so they would be stuck in the syscall map forever without this. - let executing: Vec = self - .syscall - .iter() - .filter(|entry| { - matches!( - entry.value().state(), - CoroutineState::Syscall((), _, SyscallState::Executing) - ) - }) - .map(|entry| *entry.key()) - .collect(); - for co_id in executing { - self.try_resume(co_id); - } Ok(()) } From 726199bf234fbc4846042e5c90477d69069bbeee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 17:34:11 +0000 Subject: [PATCH 07/31] fix: conditionally reorder change_state to prevent both live-lock and SIGURG during logging For transitions TO Running: log before notifying listeners. MonitorListener starts the 10ms preemption timer on Running state, so logging first ensures the timer starts after the potentially slow I/O completes, preventing the preemption live-lock on QEMU. For all other transitions: notify before logging (original order). MonitorListener removes the NOTIFY_NODE for non-Running states, so removing it first prevents SIGURG from arriving during the log I/O. 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/7e58a7e2-bd00-42ef-8614-002f14778590 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index ab30488e..ce1fed20 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,16 +18,25 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - //先打印日志再通知监听器,避免在QEMU等慢速平台上的活锁问题 - // Log before notifying listeners so that MonitorListener submits - // the NOTIFY_NODE after the (potentially slow) log I/O completes, - // preventing a preemption live-lock on slow platforms like QEMU. - if let CoroutineState::Error(_) = new_state { - error!("{} {:?}->{:?}", self.name(), old_state, new_state); - } else { + if matches!(new_state, CoroutineState::Running) { + //先打印日志再通知监听器,避免在QEMU等慢速平台上的活锁问题 + // Log before notifying: MonitorListener starts the 10ms preemption + // timer on Running state. On QEMU, info!() takes >10ms, so logging + // first ensures the timer starts after the slow I/O completes. info!("{} {:?}->{:?}", self.name(), old_state, new_state); + self.on_state_changed(self, old_state, new_state); + } else { + //先通知监听器再打印日志,确保抢占定时器在日志I/O之前被移除 + // Notify before logging: MonitorListener removes the NOTIFY_NODE for + // non-Running states. Removing it first prevents SIGURG from arriving + // during the potentially slow log I/O. 
+ self.on_state_changed(self, old_state, new_state); + if let CoroutineState::Error(_) = new_state { + error!("{} {:?}->{:?}", self.name(), old_state, new_state); + } else { + info!("{} {:?}->{:?}", self.name(), old_state, new_state); + } } - self.on_state_changed(self, old_state, new_state); old_state } From 4675117d10f8bbb0d21d5b3125f23ce37563e1d7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Apr 2026 17:55:07 +0000 Subject: [PATCH 08/31] fix: hook write with re-entrancy guard to prevent preemption during logging, revert change_state conditional reorder Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/c679f7fd-4c3d-4a1c-a07d-637ba7b9fc00 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 21 ++++----------------- core/src/syscall/unix/mod.rs | 33 +++++++++++++++++++++++++++++++++ hook/src/syscall/unix.rs | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 72 insertions(+), 18 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index ce1fed20..1ba91410 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,24 +18,11 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - if matches!(new_state, CoroutineState::Running) { - //先打印日志再通知监听器,避免在QEMU等慢速平台上的活锁问题 - // Log before notifying: MonitorListener starts the 10ms preemption - // timer on Running state. On QEMU, info!() takes >10ms, so logging - // first ensures the timer starts after the slow I/O completes. 
- info!("{} {:?}->{:?}", self.name(), old_state, new_state); - self.on_state_changed(self, old_state, new_state); + self.on_state_changed(self, old_state, new_state); + if let CoroutineState::Error(_) = new_state { + error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { - //先通知监听器再打印日志,确保抢占定时器在日志I/O之前被移除 - // Notify before logging: MonitorListener removes the NOTIFY_NODE for - // non-Running states. Removing it first prevents SIGURG from arriving - // during the potentially slow log I/O. - self.on_state_changed(self, old_state, new_state); - if let CoroutineState::Error(_) = new_state { - error!("{} {:?}->{:?}", self.name(), old_state, new_state); - } else { - info!("{} {:?}->{:?}", self.name(), old_state, new_state); - } + info!("{} {:?}->{:?}", self.name(), old_state, new_state); } old_state } diff --git a/core/src/syscall/unix/mod.rs b/core/src/syscall/unix/mod.rs index f1198be4..cda56dee 100644 --- a/core/src/syscall/unix/mod.rs +++ b/core/src/syscall/unix/mod.rs @@ -2,6 +2,25 @@ use dashmap::DashMap; use once_cell::sync::Lazy; use std::ffi::c_int; +//防止重入:info!()/error!()内部会调用write(),如果write被hook了, +//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用内部系统调用跳过facade逻辑。 +// Re-entrancy guard: info!()/error!() internally call write(). If write is hooked, +// this causes infinite recursion or nested state transitions that corrupt coroutine state. +// When re-entrancy is detected, bypass the facade and call the inner syscall directly. +thread_local! { + static IN_FACADE: std::cell::Cell = const { std::cell::Cell::new(false) }; +} + +#[inline] +pub fn in_facade() -> bool { + IN_FACADE.get() +} + +#[inline] +pub fn set_in_facade(val: bool) { + IN_FACADE.set(val); +} + macro_rules! impl_syscall { ( $facade_struct_name:ident, $iocp_struct_name: ident, $nio_struct_name: ident, $raw_struct_name: ident, @@ -97,7 +116,18 @@ macro_rules! 
impl_facade { fn_ptr: Option<&extern "C" fn($($arg_type),*) -> $result>, $($arg: $arg_type),* ) -> $result { + if $crate::syscall::in_facade() { + return self.inner.$syscall(fn_ptr, $($arg, )*); + } let syscall = $crate::common::constants::SyscallName::$syscall; + //在日志和状态变更期间设置防重入标志,因为info!()/error!()内部会 + //调用write(),co.syscall()/co.running()内部会调用change_state() + //再调用info!(),这些都可能触发hooked write导致无限递归 + // Set re-entrancy guard during logging and state changes because: + // - info!()/error!() internally call write() + // - co.syscall()/co.running() call change_state() which calls info!() + // Both can trigger hooked write causing infinite recursion. + $crate::syscall::set_in_facade(true); $crate::info!("enter syscall {}", syscall); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; @@ -107,13 +137,16 @@ macro_rules! impl_facade { ); } } + $crate::syscall::set_in_facade(false); let r = self.inner.$syscall(fn_ptr, $($arg, )*); + $crate::syscall::set_in_facade(true); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { $crate::error!("{} change to running state failed !", co.name()); } } $crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); + $crate::syscall::set_in_facade(false); r } } diff --git a/hook/src/syscall/unix.rs b/hook/src/syscall/unix.rs index aabd517b..7ad0b5de 100644 --- a/hook/src/syscall/unix.rs +++ b/hook/src/syscall/unix.rs @@ -83,6 +83,40 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf // impl_hook!(POLL, poll(fds: *mut pollfd, nfds: nfds_t, timeout: c_int) -> c_int); // NOTE: unhook write/pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug -// impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t); // impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int); // 
impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int); + +//write需要特殊的hook实现:stdout/stderr的write由日志框架触发, +//必须绕过facade直接调用原始write,否则facade内部的info!()会再次 +//触发write导致stdout RefCell重复借用。其他fd正常走facade。 +// write needs a custom hook: writes to stdout/stderr are triggered by +// the logging framework. They must bypass the facade and call raw write +// directly; otherwise the facade's info!() would re-trigger write, +// causing stdout's RefCell to be double-borrowed. Other fds go through +// the facade normally. +#[no_mangle] +pub extern "C" fn write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t { + static WRITE: once_cell::sync::Lazy ssize_t> = + once_cell::sync::Lazy::new(|| unsafe { + let symbol = std::ffi::CString::new("write") + .unwrap_or_else(|_| panic!("can not transfer \"write\" to CString")); + let ptr = libc::dlsym(libc::RTLD_NEXT, symbol.as_ptr()); + assert!(!ptr.is_null(), "syscall \"write\" not found !"); + std::mem::transmute(ptr) + }); + let fn_ptr = once_cell::sync::Lazy::force(&WRITE); + // stdout(1)/stderr(2)的write由日志框架触发,必须绕过facade + // Bypass facade for stdout/stderr — these are logging fds + if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO + || open_coroutine_core::syscall::in_facade() + { + return (fn_ptr)(fd, buf, count); + } + if crate::hook() + || open_coroutine_core::scheduler::SchedulableCoroutine::current().is_some() + || cfg!(feature = "ci") + { + return open_coroutine_core::syscall::write(Some(fn_ptr), fd, buf, count); + } + (fn_ptr)(fd, buf, count) +} From ca70cd73b48e3446be5c2e2ddaa40e16e4a9be63 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 00:42:20 +0000 Subject: [PATCH 09/31] refactor: use impl_hook! 
for write and implement special facade in write.rs Move the stdout/stderr bypass logic from a custom hook in hook/src/syscall/unix.rs to a specialized facade in core/src/syscall/unix/write.rs per maintainer feedback. - hook/src/syscall/unix.rs: Use standard impl_hook!(WRITE, write(...)) macro - core/src/syscall/unix/write.rs: Replace generic impl_facade! with a custom WriteSyscallFacade that bypasses state transitions and logging for: - stdout(fd=1)/stderr(fd=2) writes (triggered by logging framework) - writes during in_facade() re-entrancy (prevents recursive facade entry) Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/6a64e463-d9fb-4d2f-9e86-284f579ee741 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/syscall/unix/write.rs | 59 ++++++++++++++++++++++++++++++++-- hook/src/syscall/unix.rs | 37 ++------------------- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 9449a855..3a9911e9 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -15,9 +15,62 @@ impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWrite write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t ); -impl_facade!(WriteSyscallFacade, WriteSyscall, - write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t -); +//write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发, +//必须跳过状态转换和日志记录直接调用内层,否则facade内部的info!()会再次 +//触发write导致stdout RefCell重复借用(无限递归)。 +// The write facade needs special handling: writes to stdout/stderr are +// triggered by the logging framework (tracing). They must skip state +// transitions and logging, going directly to the inner layer. Otherwise +// the facade's info!() would re-trigger write, causing stdout's RefCell +// to be double-borrowed (infinite recursion). 
+#[repr(C)] +#[derive(Debug, Default)] +struct WriteSyscallFacade { + inner: I, +} + +impl WriteSyscall for WriteSyscallFacade { + extern "C" fn write( + &self, + fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, + fd: c_int, + buf: *const c_void, + len: size_t, + ) -> ssize_t { + // stdout(1)/stderr(2)由日志框架触发,或已在facade内部(防重入), + // 直接调用内层跳过状态转换和日志记录 + // Bypass state transitions for stdout/stderr (logging fds) and + // when already inside a facade (re-entrancy guard) + if fd == libc::STDOUT_FILENO + || fd == libc::STDERR_FILENO + || crate::syscall::in_facade() + { + return self.inner.write(fn_ptr, fd, buf, len); + } + let syscall = crate::common::constants::SyscallName::write; + crate::syscall::set_in_facade(true); + crate::info!("enter syscall {}", syscall); + if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + let new_state = crate::common::constants::SyscallState::Executing; + if co.syscall((), syscall, new_state).is_err() { + crate::error!("{} change to syscall {} {} failed !", + co.name(), syscall, new_state + ); + } + } + crate::syscall::set_in_facade(false); + let r = self.inner.write(fn_ptr, fd, buf, len); + crate::syscall::set_in_facade(true); + if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + if co.running().is_err() { + crate::error!("{} change to running state failed !", co.name()); + } + } + crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); + crate::syscall::set_in_facade(false); + r + } +} impl_io_uring_write!(IoUringWriteSyscall, WriteSyscall, write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t diff --git a/hook/src/syscall/unix.rs b/hook/src/syscall/unix.rs index 7ad0b5de..84eb4d06 100644 --- a/hook/src/syscall/unix.rs +++ b/hook/src/syscall/unix.rs @@ -82,41 +82,8 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf // NOTE: unhook poll due to mio's poller // impl_hook!(POLL, poll(fds: *mut pollfd, nfds: 
nfds_t, timeout: c_int) -> c_int); -// NOTE: unhook write/pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug +// NOTE: unhook pthread_mutex_lock/pthread_mutex_unlock due to bug // impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int); // impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int); -//write需要特殊的hook实现:stdout/stderr的write由日志框架触发, -//必须绕过facade直接调用原始write,否则facade内部的info!()会再次 -//触发write导致stdout RefCell重复借用。其他fd正常走facade。 -// write needs a custom hook: writes to stdout/stderr are triggered by -// the logging framework. They must bypass the facade and call raw write -// directly; otherwise the facade's info!() would re-trigger write, -// causing stdout's RefCell to be double-borrowed. Other fds go through -// the facade normally. -#[no_mangle] -pub extern "C" fn write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t { - static WRITE: once_cell::sync::Lazy ssize_t> = - once_cell::sync::Lazy::new(|| unsafe { - let symbol = std::ffi::CString::new("write") - .unwrap_or_else(|_| panic!("can not transfer \"write\" to CString")); - let ptr = libc::dlsym(libc::RTLD_NEXT, symbol.as_ptr()); - assert!(!ptr.is_null(), "syscall \"write\" not found !"); - std::mem::transmute(ptr) - }); - let fn_ptr = once_cell::sync::Lazy::force(&WRITE); - // stdout(1)/stderr(2)的write由日志框架触发,必须绕过facade - // Bypass facade for stdout/stderr — these are logging fds - if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO - || open_coroutine_core::syscall::in_facade() - { - return (fn_ptr)(fd, buf, count); - } - if crate::hook() - || open_coroutine_core::scheduler::SchedulableCoroutine::current().is_some() - || cfg!(feature = "ci") - { - return open_coroutine_core::syscall::write(Some(fn_ptr), fd, buf, count); - } - (fn_ptr)(fd, buf, count) -} +impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t); From 438499400553bcb506bd1fee77ad47eb60d8a5b3 Mon Sep 17 00:00:00 
2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 00:54:51 +0000 Subject: [PATCH 10/31] fix: reorder impl_facade! to transition state before logging, revert do_schedule redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause fix: co.syscall(Executing) is now called BEFORE info!("enter syscall") in both impl_facade! and WriteSyscallFacade. This triggers on_state_changed which removes MonitorListener's NOTIFY_NODE before slow logging, preventing SIGURG from firing while the coroutine is still in Running state. Reverted the do_schedule Syscall(Executing) → ready queue redirect in scheduler.rs since it's no longer needed — the MonitorListener design correctly prevents signals to Syscall-state coroutines when the NOTIFY_NODE is removed before logging. Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/79faeb2b-0a58-4973-8277-f50a34bcb60b Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/scheduler.rs | 17 ++++++----------- core/src/syscall/unix/mod.rs | 18 ++++++++++-------- core/src/syscall/unix/write.rs | 2 +- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index c87d25d2..dff9980d 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -311,17 +311,12 @@ impl<'s> Scheduler<'s> { _ = RUNNING_COROUTINES.remove(&co_id); })? 
{ CoroutineState::Syscall((), _, state) => { - if let SyscallState::Executing = state { - //协程在系统调用执行期间被信号抢占,放回就绪队列以恢复执行 - self.ready.push(coroutine); - } else { - //挂起协程到系统调用表 - //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 - _ = self.syscall.insert(co_id, coroutine); - if let SyscallState::Suspend(timestamp) = state { - self.syscall_suspend - .push(SyscallSuspendItem { timestamp, co_id }); - } + //挂起协程到系统调用表 + //如果已包含,说明当前系统调用还有上层父系统调用,因此直接忽略插入结果 + _ = self.syscall.insert(co_id, coroutine); + if let SyscallState::Suspend(timestamp) = state { + self.syscall_suspend + .push(SyscallSuspendItem { timestamp, co_id }); } } CoroutineState::Suspend((), timestamp) => { diff --git a/core/src/syscall/unix/mod.rs b/core/src/syscall/unix/mod.rs index cda56dee..00b0c019 100644 --- a/core/src/syscall/unix/mod.rs +++ b/core/src/syscall/unix/mod.rs @@ -120,15 +120,16 @@ macro_rules! impl_facade { return self.inner.$syscall(fn_ptr, $($arg, )*); } let syscall = $crate::common::constants::SyscallName::$syscall; - //在日志和状态变更期间设置防重入标志,因为info!()/error!()内部会 - //调用write(),co.syscall()/co.running()内部会调用change_state() - //再调用info!(),这些都可能触发hooked write导致无限递归 - // Set re-entrancy guard during logging and state changes because: - // - info!()/error!() internally call write() - // - co.syscall()/co.running() call change_state() which calls info!() - // Both can trigger hooked write causing infinite recursion. + //先转换状态再记录日志:co.syscall(Executing)会通过on_state_changed + //移除MonitorListener的NOTIFY_NODE,使monitor不再发送SIGURG。 + //如果先调用info!()再转换状态,在QEMU等慢平台上info!()可能耗时>10ms, + //导致SIGURG在协程还处于Running状态时被发送,造成抢占活锁。 + // Transition state BEFORE logging: co.syscall(Executing) triggers + // on_state_changed which removes MonitorListener's NOTIFY_NODE, + // preventing the monitor from sending SIGURG. If info!() is called + // first while still in Running state, it can take >10ms on slow + // platforms (QEMU), causing SIGURG to fire and preemption live-lock. 
$crate::syscall::set_in_facade(true); - $crate::info!("enter syscall {}", syscall); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -137,6 +138,7 @@ macro_rules! impl_facade { ); } } + $crate::info!("enter syscall {}", syscall); $crate::syscall::set_in_facade(false); let r = self.inner.$syscall(fn_ptr, $($arg, )*); $crate::syscall::set_in_facade(true); diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 3a9911e9..d1c86eb4 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -49,7 +49,6 @@ impl WriteSyscall for WriteSyscallFacade { } let syscall = crate::common::constants::SyscallName::write; crate::syscall::set_in_facade(true); - crate::info!("enter syscall {}", syscall); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { let new_state = crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -58,6 +57,7 @@ impl WriteSyscall for WriteSyscallFacade { ); } } + crate::info!("enter syscall {}", syscall); crate::syscall::set_in_facade(false); let r = self.inner.write(fn_ptr, fd, buf, len); crate::syscall::set_in_facade(true); From c49454d67e277b8ae79c196715d74a410fe2ffde Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 09:01:11 +0800 Subject: [PATCH 11/31] move code --- hook/src/syscall/unix.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hook/src/syscall/unix.rs b/hook/src/syscall/unix.rs index 84eb4d06..c39de345 100644 --- a/hook/src/syscall/unix.rs +++ b/hook/src/syscall/unix.rs @@ -62,6 +62,7 @@ impl_hook!(PREADV, preadv(fd: c_int, iov: *const iovec, iovcnt: c_int, offset: o impl_hook!(RECVMSG, recvmsg(fd: c_int, msg: *mut msghdr, flags: c_int) -> ssize_t); impl_hook!(SEND, send(fd: c_int, buf: *const c_void, len: size_t, flags: c_int) -> 
ssize_t); impl_hook!(SENDTO, sendto(fd: c_int, buf: *const c_void, len: size_t, flags: c_int, addr: *const sockaddr, addrlen: socklen_t) -> ssize_t); +impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t); impl_hook!(PWRITE, pwrite(fd: c_int, buf: *const c_void, count: size_t, offset: off_t) -> ssize_t); impl_hook!(WRITEV, writev(fd: c_int, iov: *const iovec, iovcnt: c_int) -> ssize_t); impl_hook!(PWRITEV, pwritev(fd: c_int, iov: *const iovec, iovcnt: c_int, offset: off_t) -> ssize_t); @@ -85,5 +86,3 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf // NOTE: unhook pthread_mutex_lock/pthread_mutex_unlock due to bug // impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int); // impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int); - -impl_hook!(WRITE, write(fd: c_int, buf: *const c_void, count: size_t) -> ssize_t); From 7e7f4f520b8a1778a80b52dd6c792ae7901265f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 01:10:30 +0000 Subject: [PATCH 12/31] refactor: move IN_FACADE re-entrancy guard from mod.rs to write.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IN_FACADE thread-local and its accessors (in_facade/set_in_facade) are specific to the write hook's re-entrancy prevention. Moved from the generic impl_facade! macro in mod.rs to write.rs where they belong. The generic impl_facade! no longer has IN_FACADE logic — it keeps the state-transition-before-logging reorder for the preemption fix, while write.rs handles its own re-entrancy guard privately. 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/481d5b61-d175-4f75-ade3-a6f0b5511a71 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/syscall/unix/mod.rs | 26 -------------------------- core/src/syscall/unix/write.rs | 29 ++++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/core/src/syscall/unix/mod.rs b/core/src/syscall/unix/mod.rs index 00b0c019..fdb4e969 100644 --- a/core/src/syscall/unix/mod.rs +++ b/core/src/syscall/unix/mod.rs @@ -2,25 +2,6 @@ use dashmap::DashMap; use once_cell::sync::Lazy; use std::ffi::c_int; -//防止重入:info!()/error!()内部会调用write(),如果write被hook了, -//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用内部系统调用跳过facade逻辑。 -// Re-entrancy guard: info!()/error!() internally call write(). If write is hooked, -// this causes infinite recursion or nested state transitions that corrupt coroutine state. -// When re-entrancy is detected, bypass the facade and call the inner syscall directly. -thread_local! { - static IN_FACADE: std::cell::Cell<bool> = const { std::cell::Cell::new(false) }; -} - -#[inline] -pub fn in_facade() -> bool { - IN_FACADE.get() -} - -#[inline] -pub fn set_in_facade(val: bool) { - IN_FACADE.set(val); -} - macro_rules! impl_syscall { ( $facade_struct_name:ident, $iocp_struct_name: ident, $nio_struct_name: ident, $raw_struct_name: ident, @@ -116,9 +97,6 @@ macro_rules! impl_facade { fn_ptr: Option<&extern "C" fn($($arg_type),*) -> $result>, $($arg: $arg_type),* ) -> $result { - if $crate::syscall::in_facade() { - return self.inner.$syscall(fn_ptr, $($arg, )*); - } let syscall = $crate::common::constants::SyscallName::$syscall; //先转换状态再记录日志:co.syscall(Executing)会通过on_state_changed //移除MonitorListener的NOTIFY_NODE,使monitor不再发送SIGURG。 @@ -129,7 +107,6 @@ macro_rules! impl_facade { // preventing the monitor from sending SIGURG. 
If info!() is called // first while still in Running state, it can take >10ms on slow // platforms (QEMU), causing SIGURG to fire and preemption live-lock. - $crate::syscall::set_in_facade(true); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -139,16 +116,13 @@ macro_rules! impl_facade { } } $crate::info!("enter syscall {}", syscall); - $crate::syscall::set_in_facade(false); let r = self.inner.$syscall(fn_ptr, $($arg, )*); - $crate::syscall::set_in_facade(true); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { $crate::error!("{} change to running state failed !", co.name()); } } $crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); - $crate::syscall::set_in_facade(false); r } } diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index d1c86eb4..2ed38f71 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -1,6 +1,25 @@ use libc::{size_t, ssize_t}; use std::ffi::{c_int, c_void}; +//防止重入:info!()/error!()内部会调用write(),如果write被hook了, +//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用内部系统调用跳过facade逻辑。 +// Re-entrancy guard: info!()/error!() internally call write(). If write is hooked, +// this causes infinite recursion or nested state transitions that corrupt coroutine state. +// When re-entrancy is detected, bypass the facade and call the inner syscall directly. +thread_local! 
{ + static IN_FACADE: std::cell::Cell<bool> = const { std::cell::Cell::new(false) }; +} + +#[inline] +fn in_facade() -> bool { + IN_FACADE.get() +} + +#[inline] +fn set_in_facade(val: bool) { + IN_FACADE.set(val); +} + trait WriteSyscall { extern "C" fn write( &self, @@ -43,12 +62,12 @@ impl WriteSyscall for WriteSyscallFacade { // when already inside a facade (re-entrancy guard) if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO - || crate::syscall::in_facade() + || in_facade() { return self.inner.write(fn_ptr, fd, buf, len); } let syscall = crate::common::constants::SyscallName::write; - crate::syscall::set_in_facade(true); + set_in_facade(true); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { let new_state = crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -58,16 +77,16 @@ impl WriteSyscall for WriteSyscallFacade { } } crate::info!("enter syscall {}", syscall); - crate::syscall::set_in_facade(false); + set_in_facade(false); let r = self.inner.write(fn_ptr, fd, buf, len); - crate::syscall::set_in_facade(true); + set_in_facade(true); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { crate::error!("{} change to running state failed !", co.name()); } } crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); - crate::syscall::set_in_facade(false); + set_in_facade(false); r } } From dd487c28fbe109e216b6ff398243afb0a7c71d64 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 01:12:53 +0000 Subject: [PATCH 13/31] fix: prevent SIGURG preemption live-lock by hooking write and reordering facade state transitions Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/481d5b61-d175-4f75-ade3-a6f0b5511a71 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/scheduler.rs | 19 +++++++++++++++++++ 1 file 
changed, 19 insertions(+) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index dff9980d..675f1984 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -227,6 +227,8 @@ impl<'s> Scheduler<'s> { co.syscall(val, syscall, SyscallState::Callback) .expect("change syscall state failed"); } + //协程在系统调用执行期间被信号抢占,直接放回就绪队列 + CoroutineState::Syscall(_, _, SyscallState::Executing) => {} _ => unreachable!("try_resume unexpect CoroutineState"), } self.ready.push(co); @@ -384,6 +386,23 @@ impl<'s> Scheduler<'s> { } } } + // Check for coroutines preempted during syscall execution (SIGURG race). + // These have SyscallState::Executing and no syscall_suspend entry, + // so they would be stuck in the syscall map forever without this. + let executing: Vec = self + .syscall + .iter() + .filter(|entry| { + matches!( + entry.value().state(), + CoroutineState::Syscall(_, _, SyscallState::Executing) + ) + }) + .map(|entry| *entry.key()) + .collect(); + for co_id in executing { + self.try_resume(co_id); + } Ok(()) } From 56f1551627ebe51fc4cc5fd2e9d0d203bdfb4fa7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 01:23:50 +0000 Subject: [PATCH 14/31] revert: rollback all scheduler.rs changes per reviewer feedback The write.rs WriteSyscallFacade fix properly handles the preemption issue by reordering state transitions before logging. With NOTIFY_NODE removed before slow logging I/O, MonitorListener will never send SIGURG to coroutines entering Syscall state, so the scheduler doesn't need special Syscall(Executing) handling. 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/d22ce767-7985-43b3-a8bf-d77128655007 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/scheduler.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/core/src/scheduler.rs b/core/src/scheduler.rs index 675f1984..dff9980d 100644 --- a/core/src/scheduler.rs +++ b/core/src/scheduler.rs @@ -227,8 +227,6 @@ impl<'s> Scheduler<'s> { co.syscall(val, syscall, SyscallState::Callback) .expect("change syscall state failed"); } - //协程在系统调用执行期间被信号抢占,直接放回就绪队列 - CoroutineState::Syscall(_, _, SyscallState::Executing) => {} _ => unreachable!("try_resume unexpect CoroutineState"), } self.ready.push(co); @@ -386,23 +384,6 @@ impl<'s> Scheduler<'s> { } } } - // Check for coroutines preempted during syscall execution (SIGURG race). - // These have SyscallState::Executing and no syscall_suspend entry, - // so they would be stuck in the syscall map forever without this. - let executing: Vec = self - .syscall - .iter() - .filter(|entry| { - matches!( - entry.value().state(), - CoroutineState::Syscall(_, _, SyscallState::Executing) - ) - }) - .map(|entry| *entry.key()) - .collect(); - for co_id in executing { - self.try_resume(co_id); - } Ok(()) } From b0d19488b8441cbf011c2672cd77ffcba023e65c Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 09:50:50 +0800 Subject: [PATCH 15/31] reorder --- core/src/coroutine/state.rs | 5 ++++- core/src/syscall/windows/mod.rs | 11 ++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index 1ba91410..1b780e1b 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,12 +18,15 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - self.on_state_changed(self, old_state, new_state); + //先打印日志再通知监听器,确保MonitorListener提交的NOTIFY_NODE时间戳 + 
//在日志输出之后,避免在QEMU等慢速平台上因日志输出耗时超过抢占间隔 + //导致协程被反复抢占无法推进的活锁问题 if let CoroutineState::Error(_) = new_state { error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { info!("{} {:?}->{:?}", self.name(), old_state, new_state); } + self.on_state_changed(self, old_state, new_state); old_state } diff --git a/core/src/syscall/windows/mod.rs b/core/src/syscall/windows/mod.rs index d3f418b8..ca07d632 100644 --- a/core/src/syscall/windows/mod.rs +++ b/core/src/syscall/windows/mod.rs @@ -76,7 +76,15 @@ macro_rules! impl_facade { $($arg: $arg_type),* ) -> $result { let syscall = $crate::common::constants::SyscallName::$syscall; - $crate::info!("enter syscall {}", syscall); + //先转换状态再记录日志:co.syscall(Executing)会通过on_state_changed + //移除MonitorListener的NOTIFY_NODE,使monitor不再发送SIGURG。 + //如果先调用info!()再转换状态,在QEMU等慢平台上info!()可能耗时>10ms, + //导致SIGURG在协程还处于Running状态时被发送,造成抢占活锁。 + // Transition state BEFORE logging: co.syscall(Executing) triggers + // on_state_changed which removes MonitorListener's NOTIFY_NODE, + // preventing the monitor from sending SIGURG. If info!() is called + // first while still in Running state, it can take >10ms on slow + // platforms (QEMU), causing SIGURG to fire and preemption live-lock. if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -85,6 +93,7 @@ macro_rules! impl_facade { ); } } + $crate::info!("enter syscall {}", syscall); let r = self.inner.$syscall(fn_ptr, $($arg, )*); // Save errno immediately—logging and coroutine bookkeeping // call Win32 APIs (e.g. CreateFileW) that clobber GetLastError(). 
From adf74986468f5f1d9279ca7629f60f7d7dc9b198 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 02:17:01 +0000 Subject: [PATCH 16/31] fix: bypass all layers (io_uring/NIO) in WriteSyscallFacade for stdout/stderr/re-entrant writes The bypass path was calling self.inner.write() which goes through IoUring/NIO layers. For stdout/stderr writes (logging), the IoUring layer submits writes to io_uring and blocks on condvar waiting for completion - causing deadlocks when no event loop is processing completions (e.g., main thread, or during change_state() logging). Fix: call the raw function pointer (fn_ptr) or libc::write directly in the bypass path, skipping all intermediate layers. Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/61190877-cc39-4de0-b0a8-1c34758e9e66 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/syscall/unix/write.rs | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 2ed38f71..de0ac785 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -2,10 +2,12 @@ use libc::{size_t, ssize_t}; use std::ffi::{c_int, c_void}; //防止重入:info!()/error!()内部会调用write(),如果write被hook了, -//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用内部系统调用跳过facade逻辑。 +//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用原始系统调用跳过 +//所有中间层(io_uring/NIO),避免io_uring提交导致condvar死锁。 // Re-entrancy guard: info!()/error!() internally call write(). If write is hooked, // this causes infinite recursion or nested state transitions that corrupt coroutine state. -// When re-entrancy is detected, bypass the facade and call the inner syscall directly. +// When re-entrancy is detected, bypass ALL layers (io_uring, NIO, facade) and call +// the raw syscall directly to avoid io_uring submission deadlocks. thread_local! 
{ static IN_FACADE: std::cell::Cell = const { std::cell::Cell::new(false) }; } @@ -35,13 +37,14 @@ impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWrite ); //write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发, -//必须跳过状态转换和日志记录直接调用内层,否则facade内部的info!()会再次 -//触发write导致stdout RefCell重复借用(无限递归)。 +//必须跳过所有中间层(facade/io_uring/NIO)直接调用原始系统调用,否则: +//1. facade内部的info!()会再次触发write导致stdout RefCell重复借用(无限递归) +//2. io_uring层会提交写操作并阻塞在condvar等待完成,导致死锁 // The write facade needs special handling: writes to stdout/stderr are -// triggered by the logging framework (tracing). They must skip state -// transitions and logging, going directly to the inner layer. Otherwise -// the facade's info!() would re-trigger write, causing stdout's RefCell -// to be double-borrowed (infinite recursion). +// triggered by the logging framework (tracing). They must bypass ALL layers +// (facade, io_uring, NIO) and call the raw syscall directly. Otherwise: +// 1. The facade's info!() re-triggers write → stdout RefCell double-borrow +// 2. The io_uring layer submits the write and blocks on condvar → deadlock #[repr(C)] #[derive(Debug, Default)] struct WriteSyscallFacade { @@ -57,14 +60,19 @@ impl WriteSyscall for WriteSyscallFacade { len: size_t, ) -> ssize_t { // stdout(1)/stderr(2)由日志框架触发,或已在facade内部(防重入), - // 直接调用内层跳过状态转换和日志记录 - // Bypass state transitions for stdout/stderr (logging fds) and - // when already inside a facade (re-entrancy guard) + // 直接调用原始系统调用,跳过所有中间层(io_uring/NIO)避免死锁 + // Bypass ALL layers for stdout/stderr (logging fds) and when already + // inside a facade (re-entrancy guard). Call raw syscall directly to + // avoid io_uring submission deadlocks and NIO event loop interactions. 
if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO || in_facade() { - return self.inner.write(fn_ptr, fd, buf, len); + return if let Some(f) = fn_ptr { + (f)(fd, buf, len) + } else { + unsafe { libc::write(fd, buf, len) } + }; } let syscall = crate::common::constants::SyscallName::write; set_in_facade(true); From 95378a2776d28f898712cb7ae54b60d52fd31f8c Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 10:51:40 +0800 Subject: [PATCH 17/31] code polish --- core/src/syscall/unix/write.rs | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index de0ac785..482055aa 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -1,6 +1,20 @@ use libc::{size_t, ssize_t}; use std::ffi::{c_int, c_void}; +trait WriteSyscall { + extern "C" fn write( + &self, + fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, + fd: c_int, + buf: *const c_void, + len: size_t, + ) -> ssize_t; +} + +impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWriteSyscall, + write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t +); + //防止重入:info!()/error!()内部会调用write(),如果write被hook了, //会导致无限递归或嵌套状态转换。当检测到重入时,直接调用原始系统调用跳过 //所有中间层(io_uring/NIO),避免io_uring提交导致condvar死锁。 @@ -22,20 +36,6 @@ fn set_in_facade(val: bool) { IN_FACADE.set(val); } -trait WriteSyscall { - extern "C" fn write( - &self, - fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, - fd: c_int, - buf: *const c_void, - len: size_t, - ) -> ssize_t; -} - -impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWriteSyscall, - write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t -); - //write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发, //必须跳过所有中间层(facade/io_uring/NIO)直接调用原始系统调用,否则: //1. 
facade内部的info!()会再次触发write导致stdout RefCell重复借用(无限递归) @@ -68,11 +68,7 @@ impl WriteSyscall for WriteSyscallFacade { || fd == libc::STDERR_FILENO || in_facade() { - return if let Some(f) = fn_ptr { - (f)(fd, buf, len) - } else { - unsafe { libc::write(fd, buf, len) } - }; + return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); } let syscall = crate::common::constants::SyscallName::write; set_in_facade(true); From d0ff4dd19d0fb78f160ac807ed690abb57bfedcb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 03:20:20 +0000 Subject: [PATCH 18/31] fix: add early bypass in write() dispatcher for stdout/stderr and revert change_state to original order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preemptive test failed on armv7 QEMU because the write hook added cumulative overhead to every info!() → write() call in the syscall facade chain. With 5+ info!() calls per nanosleep cycle, the overhead exceeded the 10ms preemption window, causing coroutines to be preempted during println!() while holding the stdout lock, leading to lock contention live-lock. Two changes: 1. write.rs: Replace impl_syscall! with a custom write() dispatcher that bypasses the entire facade chain (WriteSyscallFacade/IoUring/NIO) for stdout/stderr/in_facade writes at the top level, minimizing function call overhead per info!() invocation. 2. state.rs: Revert change_state() to the original order (on_state_changed before info). The impl_facade! reorder (state transition before logging) already handles the file_co live-lock by removing NOTIFY_NODE before slow logging. The change_state reorder is no longer needed and caused the NOTIFY_NODE to be submitted too late, leaving insufficient margin for NOTIFY_NODE removal during →Syscall transitions on slow platforms. 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/a357ac51-d143-416c-ba5b-69f43d42453e Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 5 +--- core/src/syscall/unix/write.rs | 49 ++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index 1b780e1b..1ba91410 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,15 +18,12 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - //先打印日志再通知监听器,确保MonitorListener提交的NOTIFY_NODE时间戳 - //在日志输出之后,避免在QEMU等慢速平台上因日志输出耗时超过抢占间隔 - //导致协程被反复抢占无法推进的活锁问题 + self.on_state_changed(self, old_state, new_state); if let CoroutineState::Error(_) = new_state { error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { info!("{} {:?}->{:?}", self.name(), old_state, new_state); } - self.on_state_changed(self, old_state, new_state); old_state } diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 482055aa..067de0f8 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -11,9 +11,41 @@ trait WriteSyscall { ) -> ssize_t; } -impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWriteSyscall, - write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t -); +//在最顶层对stdout/stderr/重入写入做早期旁路:直接调用原始系统调用, +//跳过整个facade链(WriteSyscallFacade/IoUring/NIO),最小化每次info!() +//调用write()时的函数调用开销。在QEMU等慢速平台上,每个额外的函数调用 +//可能耗时0.5-1ms,累积的开销会导致协程在10ms抢占窗口内无法完成工作。 +// Early bypass at the top-level dispatcher for stdout/stderr/re-entrant writes: +// call the raw syscall directly, skipping the entire facade chain +// (WriteSyscallFacade/IoUring/NIO). This minimizes function call overhead +// per info!() → write() invocation. 
On slow platforms (QEMU), each extra +// function call can cost 0.5-1ms, and cumulative overhead prevents coroutines +// from completing work within the 10ms preemption window. +#[must_use] +pub extern "C" fn write( + fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, + fd: c_int, + buf: *const c_void, + len: size_t, +) -> ssize_t { + if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO || in_facade() { + if let Some(f) = fn_ptr { + return (f)(fd, buf, len); + } + return unsafe { libc::write(fd, buf, len) }; + } + cfg_if::cfg_if! { + if #[cfg(all(target_os = "linux", feature = "io_uring"))] { + static CHAIN: once_cell::sync::Lazy< + WriteSyscallFacade>> + > = once_cell::sync::Lazy::new(Default::default); + } else { + static CHAIN: once_cell::sync::Lazy>> = + once_cell::sync::Lazy::new(Default::default); + } + } + CHAIN.write(fn_ptr, fd, buf, len) +} //防止重入:info!()/error!()内部会调用write(),如果write被hook了, //会导致无限递归或嵌套状态转换。当检测到重入时,直接调用原始系统调用跳过 @@ -59,17 +91,6 @@ impl WriteSyscall for WriteSyscallFacade { buf: *const c_void, len: size_t, ) -> ssize_t { - // stdout(1)/stderr(2)由日志框架触发,或已在facade内部(防重入), - // 直接调用原始系统调用,跳过所有中间层(io_uring/NIO)避免死锁 - // Bypass ALL layers for stdout/stderr (logging fds) and when already - // inside a facade (re-entrancy guard). Call raw syscall directly to - // avoid io_uring submission deadlocks and NIO event loop interactions. 
- if fd == libc::STDOUT_FILENO - || fd == libc::STDERR_FILENO - || in_facade() - { - return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); - } let syscall = crate::common::constants::SyscallName::write; set_in_facade(true); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { From 3b8cf285a58575c72e6b7deaf3094bf55851882b Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 13:16:59 +0800 Subject: [PATCH 19/31] clean code --- core/src/syscall/unix/write.rs | 78 +++------------------------------- 1 file changed, 5 insertions(+), 73 deletions(-) diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 067de0f8..70b0e8bd 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -28,11 +28,8 @@ pub extern "C" fn write( buf: *const c_void, len: size_t, ) -> ssize_t { - if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO || in_facade() { - if let Some(f) = fn_ptr { - return (f)(fd, buf, len); - } - return unsafe { libc::write(fd, buf, len) }; + if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO { + return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); } cfg_if::cfg_if! { if #[cfg(all(target_os = "linux", feature = "io_uring"))] { @@ -47,74 +44,9 @@ pub extern "C" fn write( CHAIN.write(fn_ptr, fd, buf, len) } -//防止重入:info!()/error!()内部会调用write(),如果write被hook了, -//会导致无限递归或嵌套状态转换。当检测到重入时,直接调用原始系统调用跳过 -//所有中间层(io_uring/NIO),避免io_uring提交导致condvar死锁。 -// Re-entrancy guard: info!()/error!() internally call write(). If write is hooked, -// this causes infinite recursion or nested state transitions that corrupt coroutine state. -// When re-entrancy is detected, bypass ALL layers (io_uring, NIO, facade) and call -// the raw syscall directly to avoid io_uring submission deadlocks. -thread_local! 
{ - static IN_FACADE: std::cell::Cell = const { std::cell::Cell::new(false) }; -} - -#[inline] -fn in_facade() -> bool { - IN_FACADE.get() -} - -#[inline] -fn set_in_facade(val: bool) { - IN_FACADE.set(val); -} - -//write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发, -//必须跳过所有中间层(facade/io_uring/NIO)直接调用原始系统调用,否则: -//1. facade内部的info!()会再次触发write导致stdout RefCell重复借用(无限递归) -//2. io_uring层会提交写操作并阻塞在condvar等待完成,导致死锁 -// The write facade needs special handling: writes to stdout/stderr are -// triggered by the logging framework (tracing). They must bypass ALL layers -// (facade, io_uring, NIO) and call the raw syscall directly. Otherwise: -// 1. The facade's info!() re-triggers write → stdout RefCell double-borrow -// 2. The io_uring layer submits the write and blocks on condvar → deadlock -#[repr(C)] -#[derive(Debug, Default)] -struct WriteSyscallFacade { - inner: I, -} - -impl WriteSyscall for WriteSyscallFacade { - extern "C" fn write( - &self, - fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, - fd: c_int, - buf: *const c_void, - len: size_t, - ) -> ssize_t { - let syscall = crate::common::constants::SyscallName::write; - set_in_facade(true); - if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { - let new_state = crate::common::constants::SyscallState::Executing; - if co.syscall((), syscall, new_state).is_err() { - crate::error!("{} change to syscall {} {} failed !", - co.name(), syscall, new_state - ); - } - } - crate::info!("enter syscall {}", syscall); - set_in_facade(false); - let r = self.inner.write(fn_ptr, fd, buf, len); - set_in_facade(true); - if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { - if co.running().is_err() { - crate::error!("{} change to running state failed !", co.name()); - } - } - crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); - set_in_facade(false); - r - } -} +impl_facade!(WriteSyscallFacade, WriteSyscall, + write(fd: c_int, buf: *const 
c_void, len: size_t) -> ssize_t +); impl_io_uring_write!(IoUringWriteSyscall, WriteSyscall, write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t From f2a219ab73d8ba2ea9f05abbb54261cb3a459d7d Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sat, 4 Apr 2026 13:19:03 +0800 Subject: [PATCH 20/31] clean code --- core/src/syscall/unix/write.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 70b0e8bd..a2e9bc89 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -28,9 +28,6 @@ pub extern "C" fn write( buf: *const c_void, len: size_t, ) -> ssize_t { - if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO { - return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); - } cfg_if::cfg_if! { if #[cfg(all(target_os = "linux", feature = "io_uring"))] { static CHAIN: once_cell::sync::Lazy< @@ -41,6 +38,9 @@ pub extern "C" fn write( once_cell::sync::Lazy::new(Default::default); } } + if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO { + return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); + } CHAIN.write(fn_ptr, fd, buf, len) } From 5ddf2effc948c3b9fd8e497126b8ab29ae171487 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 05:51:22 +0000 Subject: [PATCH 21/31] =?UTF-8?q?fix:=20conditionally=20reorder=20change?= =?UTF-8?q?=5Fstate=20for=20=E2=86=92Running=20to=20prevent=20live-lock=20?= =?UTF-8?q?on=20QEMU?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/4da6c3b7-ffcf-483e-a492-a3fead1a48d2 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/core/src/coroutine/state.rs 
b/core/src/coroutine/state.rs index 1ba91410..c54f4c51 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,11 +18,26 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - self.on_state_changed(self, old_state, new_state); - if let CoroutineState::Error(_) = new_state { - error!("{} {:?}->{:?}", self.name(), old_state, new_state); - } else { + //对→Running的转换:先记录日志再通知MonitorListener。 + //on_state_changed(Running)会通过MonitorListener设置10ms的NOTIFY_NODE定时器, + //如果先通知再记录日志,在QEMU等慢平台上info!()可能耗时>10ms, + //导致定时器在日志记录期间过期→SIGURG→抢占活锁。 + //先记录日志确保NOTIFY_NODE定时器在日志I/O完成后才启动。 + // For →Running transitions: log BEFORE notifying MonitorListener. + // on_state_changed(Running) sets a 10ms NOTIFY_NODE timer via MonitorListener. + // If notified first, info!() can take >10ms on slow platforms (QEMU), + // causing the timer to expire during logging → SIGURG → preemption live-lock. + // Logging first ensures the NOTIFY_NODE timer starts after slow I/O completes. + if matches!(new_state, CoroutineState::Running) { info!("{} {:?}->{:?}", self.name(), old_state, new_state); + self.on_state_changed(self, old_state, new_state); + } else { + self.on_state_changed(self, old_state, new_state); + if let CoroutineState::Error(_) = new_state { + error!("{} {:?}->{:?}", self.name(), old_state, new_state); + } else { + info!("{} {:?}->{:?}", self.name(), old_state, new_state); + } } old_state } From 6ad23ff929ea724c2c6d1b1175c5e390ec4b8f0e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 07:10:01 +0000 Subject: [PATCH 22/31] fix: skip SIGURG preemption for Syscall-state coroutines to prevent deadlock A race between NOTIFY_NODE removal (in on_state_changed) and the monitor's queue iteration can deliver SIGURG after the coroutine entered Syscall state. 
Preempting a Syscall(Executing) coroutine puts it in the syscall map with no io_uring/epoll/timer registration to wake it, causing a deadlock on slow QEMU platforms. Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/ba73092e-273c-4063-a297-a7ed471b9cba Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/monitor.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 502ecbdf..1bfe3fba 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -78,6 +78,24 @@ impl Monitor { set.remove(Signal::SIGURG); set.thread_set_mask() .expect("Failed to remove SIGURG signal mask!"); + //不抢占处于Syscall状态的协程。 + //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 + //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, + //SIGURG可能在协程刚进入Syscall状态时到达。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), + //导致死锁。 + // Skip preemption for coroutines in Syscall state. + // MonitorListener's design is to NOT send signals to Syscall-state + // coroutines. However, a race between NOTIFY_NODE removal and the + // monitor's queue iteration can cause SIGURG to arrive just after + // the coroutine entered Syscall state. If preempted here, the + // coroutine lands in the syscall map with no io_uring/epoll/timer + // registration to wake it, causing a deadlock. 
+ if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + if matches!(co.state(), CoroutineState::Syscall(_, _, _)) { + return; + } + } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } From 0178f4e7c21647bfadfe8ea474d6334a54a46adc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 10:25:01 +0000 Subject: [PATCH 23/31] fix: move Syscall guard from sigurg_handler to monitor_thread_main, check notify_queue.contains(node) before pthread_kill Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/35c147b1-dc85-43d5-8b59-57f6e2e6eb3d Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/monitor.rs | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 1bfe3fba..48bc6226 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -78,24 +78,6 @@ impl Monitor { set.remove(Signal::SIGURG); set.thread_set_mask() .expect("Failed to remove SIGURG signal mask!"); - //不抢占处于Syscall状态的协程。 - //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 - //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, - //SIGURG可能在协程刚进入Syscall状态时到达。 - //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), - //导致死锁。 - // Skip preemption for coroutines in Syscall state. - // MonitorListener's design is to NOT send signals to Syscall-state - // coroutines. However, a race between NOTIFY_NODE removal and the - // monitor's queue iteration can cause SIGURG to arrive just after - // the coroutine entered Syscall state. If preempted here, the - // coroutine lands in the syscall map with no io_uring/epoll/timer - // registration to wake it, causing a deadlock. 
- if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { - if matches!(co.state(), CoroutineState::Syscall(_, _, _)) { - return; - } - } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } @@ -185,6 +167,15 @@ impl Monitor { } //实际上只对陷入重度计算的协程发送信号抢占 //对于陷入执行系统调用的协程不发送信号(如果发送信号,会打断系统调用,进而降低总体性能) + //当协程进入Syscall状态时,on_state_changed会从notify_queue中移除节点。 + //此处检查节点是否仍在队列中,避免对已进入Syscall状态的协程发送信号。 + // When a coroutine enters Syscall state, on_state_changed removes + // its node from the notify_queue. Check that the node is still + // present before sending the signal, to avoid preempting a + // coroutine that has already entered Syscall state. + if !notify_queue.contains(node) { + continue; + } cfg_if::cfg_if! { if #[cfg(unix)] { if pthread_kill(node.pthread, Signal::SIGURG).is_err() { From 38e9537be3aede8f005f08f67188e3af7d827186 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 11:15:06 +0000 Subject: [PATCH 24/31] fix: replace logically-wrong contains() with two-phase snapshot approach in monitor_thread_main Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/86fbc9f6-ebe9-4a42-95f6-c047dbb579d8 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/monitor.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 48bc6226..4a5198f7 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -161,18 +161,17 @@ impl Monitor { let notify_queue = unsafe { &*monitor.notify_queue.get() }; while MonitorState::Running == monitor.state.get() || !notify_queue.is_empty() { //只遍历,不删除,如果抢占调度失败,会在1ms后不断重试,相当于主动检测 - for node in notify_queue { - if now() < node.timestamp { - continue; - } + //先收集超时节点快照,再逐个检查是否仍在队列中 + //(在收集和检查之间,on_state_changed可能已将节点移除——协程进入了Syscall状态) + let current = now(); + let expired: Vec = notify_queue + .iter() 
+ .filter(|n| current >= n.timestamp) + .copied() + .collect(); + for node in &expired { //实际上只对陷入重度计算的协程发送信号抢占 //对于陷入执行系统调用的协程不发送信号(如果发送信号,会打断系统调用,进而降低总体性能) - //当协程进入Syscall状态时,on_state_changed会从notify_queue中移除节点。 - //此处检查节点是否仍在队列中,避免对已进入Syscall状态的协程发送信号。 - // When a coroutine enters Syscall state, on_state_changed removes - // its node from the notify_queue. Check that the node is still - // present before sending the signal, to avoid preempting a - // coroutine that has already entered Syscall state. if !notify_queue.contains(node) { continue; } From 438c29eacb2a6f79bfc253ebbcbd5bce1b80c3f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 11:19:14 +0000 Subject: [PATCH 25/31] perf: pre-allocate expired Vec outside monitor loop to avoid per-iteration allocation Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/86fbc9f6-ebe9-4a42-95f6-c047dbb579d8 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/monitor.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 4a5198f7..319a5c7b 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -159,16 +159,14 @@ impl Monitor { let monitor = Self::get_instance(); Self::init_current(monitor); let notify_queue = unsafe { &*monitor.notify_queue.get() }; + //先收集超时节点快照,再逐个检查是否仍在队列中 + //(在收集和检查之间,on_state_changed可能已将节点移除——协程进入了Syscall状态) + let mut expired = Vec::new(); while MonitorState::Running == monitor.state.get() || !notify_queue.is_empty() { //只遍历,不删除,如果抢占调度失败,会在1ms后不断重试,相当于主动检测 - //先收集超时节点快照,再逐个检查是否仍在队列中 - //(在收集和检查之间,on_state_changed可能已将节点移除——协程进入了Syscall状态) + expired.clear(); let current = now(); - let expired: Vec = notify_queue - .iter() - .filter(|n| current >= n.timestamp) - .copied() - .collect(); + expired.extend(notify_queue.iter().filter(|n| current >= n.timestamp).copied()); for node in &expired { 
//实际上只对陷入重度计算的协程发送信号抢占 //对于陷入执行系统调用的协程不发送信号(如果发送信号,会打断系统调用,进而降低总体性能) From c62a624ca0bdf56af80fa119084e11bb2265c7af Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Sun, 5 Apr 2026 19:32:06 +0800 Subject: [PATCH 26/31] enhance stability --- core/src/monitor.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 319a5c7b..537a079f 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -166,7 +166,12 @@ impl Monitor { //只遍历,不删除,如果抢占调度失败,会在1ms后不断重试,相当于主动检测 expired.clear(); let current = now(); - expired.extend(notify_queue.iter().filter(|n| current >= n.timestamp).copied()); + expired.extend( + notify_queue + .iter() + .filter(|n| current >= n.timestamp) + .copied(), + ); for node in &expired { //实际上只对陷入重度计算的协程发送信号抢占 //对于陷入执行系统调用的协程不发送信号(如果发送信号,会打断系统调用,进而降低总体性能) From 6ac5bf1d4dbe1859fc00a03eca9ff854a7712a86 Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Mon, 6 Apr 2026 09:46:22 +0800 Subject: [PATCH 27/31] enhance stability --- core/src/monitor.rs | 42 ++++++++++++++++-- core/src/syscall/unix/write.rs | 80 ++++++++++++++++++++-------------- hook/src/syscall/unix.rs | 2 +- 3 files changed, 87 insertions(+), 37 deletions(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 537a079f..4bdcd2d9 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -3,7 +3,7 @@ use crate::common::constants::{CoroutineState, MONITOR_BEAN}; use crate::common::{get_timeout_time, now, CondvarBlocker}; use crate::coroutine::listener::Listener; use crate::coroutine::local::CoroutineLocal; -use crate::scheduler::SchedulableSuspender; +use crate::scheduler::{SchedulableCoroutine, SchedulableSuspender}; use crate::{catch, error, impl_current_for, impl_display_by_debug, info}; #[cfg(unix)] use nix::sys::pthread::{pthread_kill, pthread_self, Pthread}; @@ -78,6 +78,24 @@ impl Monitor { set.remove(Signal::SIGURG); set.thread_set_mask() .expect("Failed to remove SIGURG 
signal mask!"); + //不抢占处于Syscall状态的协程。 + //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 + //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, + //SIGURG可能在协程刚进入Syscall状态时到达。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), + //导致死锁。 + // Skip preemption for coroutines in Syscall state. + // MonitorListener's design is to NOT send signals to Syscall-state + // coroutines. However, a race between NOTIFY_NODE removal and the + // monitor's queue iteration can cause SIGURG to arrive just after + // the coroutine entered Syscall state. If preempted here, the + // coroutine lands in the syscall map with no io_uring/epoll/timer + // registration to wake it, causing a deadlock. + if let Some(co) = SchedulableCoroutine::current() { + if matches!(co.state(), CoroutineState::Syscall((), _, _)) { + return; + } + } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } @@ -89,7 +107,7 @@ impl Monitor { // install panic hook std::panic::set_hook(Box::new(|panic_hook_info| { let syscall = crate::common::constants::SyscallName::panicking; - if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + if let Some(co) = SchedulableCoroutine::current() { let new_state = crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { error!( @@ -109,7 +127,7 @@ impl Monitor { "stack backtrace:\n{}", std::backtrace::Backtrace::force_capture() ); - if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + if let Some(co) = SchedulableCoroutine::current() { if co.running().is_err() { error!("{} change to running state failed !", co.name()); } @@ -534,6 +552,24 @@ extern "C" fn do_preempt() { // coroutine never yielded (no hooked syscalls) — it is truly CPU-bound. // Force immediate suspension. 
flag.set(false); + //不抢占处于Syscall状态的协程。 + //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 + //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, + //SIGURG可能在协程刚进入Syscall状态时到达。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), + //导致死锁。 + // Skip preemption for coroutines in Syscall state. + // MonitorListener's design is to NOT send signals to Syscall-state + // coroutines. However, a race between NOTIFY_NODE removal and the + // monitor's queue iteration can cause SIGURG to arrive just after + // the coroutine entered Syscall state. If preempted here, the + // coroutine lands in the syscall map with no io_uring/epoll/timer + // registration to wake it, causing a deadlock. + if let Some(co) = SchedulableCoroutine::current() { + if matches!(co.state(), CoroutineState::Syscall((), _, _)) { + return; + } + } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index a2e9bc89..4171a812 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -11,43 +11,57 @@ trait WriteSyscall { ) -> ssize_t; } -//在最顶层对stdout/stderr/重入写入做早期旁路:直接调用原始系统调用, -//跳过整个facade链(WriteSyscallFacade/IoUring/NIO),最小化每次info!() -//调用write()时的函数调用开销。在QEMU等慢速平台上,每个额外的函数调用 -//可能耗时0.5-1ms,累积的开销会导致协程在10ms抢占窗口内无法完成工作。 -// Early bypass at the top-level dispatcher for stdout/stderr/re-entrant writes: -// call the raw syscall directly, skipping the entire facade chain -// (WriteSyscallFacade/IoUring/NIO). This minimizes function call overhead -// per info!() → write() invocation. On slow platforms (QEMU), each extra -// function call can cost 0.5-1ms, and cumulative overhead prevents coroutines -// from completing work within the 10ms preemption window. -#[must_use] -pub extern "C" fn write( - fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, - fd: c_int, - buf: *const c_void, - len: size_t, -) -> ssize_t { - cfg_if::cfg_if! 
{ - if #[cfg(all(target_os = "linux", feature = "io_uring"))] { - static CHAIN: once_cell::sync::Lazy< - WriteSyscallFacade>> - > = once_cell::sync::Lazy::new(Default::default); - } else { - static CHAIN: once_cell::sync::Lazy>> = - once_cell::sync::Lazy::new(Default::default); +impl_syscall!(WriteSyscallFacade, IoUringWriteSyscall, NioWriteSyscall, RawWriteSyscall, + write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t +); + +//write的facade需要特殊处理:stdout/stderr的write由日志框架(tracing)触发, +//必须跳过所有中间层(facade/io_uring/NIO)直接调用原始系统调用,否则: +//1. facade内部的info!()会再次触发write导致stdout RefCell重复借用(无限递归) +//2. io_uring层会提交写操作并阻塞在condvar等待完成,导致死锁 +// The write facade needs special handling: writes to stdout/stderr are +// triggered by the logging framework (tracing). They must bypass ALL layers +// (facade, io_uring, NIO) and call the raw syscall directly. Otherwise: +// 1. The facade's info!() re-triggers write → stdout RefCell double-borrow +// 2. The io_uring layer submits the write and blocks on condvar → deadlock +#[repr(C)] +#[derive(Debug, Default)] +struct WriteSyscallFacade { + inner: I, +} + +impl WriteSyscall for WriteSyscallFacade { + extern "C" fn write( + &self, + fn_ptr: Option<&extern "C" fn(c_int, *const c_void, size_t) -> ssize_t>, + fd: c_int, + buf: *const c_void, + len: size_t, + ) -> ssize_t { + if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO { + return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); } + let syscall = crate::common::constants::SyscallName::write; + crate::info!("enter syscall {}", syscall); + if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + let new_state = crate::common::constants::SyscallState::Executing; + if co.syscall((), syscall, new_state).is_err() { + crate::error!("{} change to syscall {} {} failed !", + co.name(), syscall, new_state + ); + } + } + let r = self.inner.write(fn_ptr, fd, buf, len); + if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { + if 
co.running().is_err() { + crate::error!("{} change to running state failed !", co.name()); + } + } + crate::info!("exit syscall {} {:?} {}", syscall, r, std::io::Error::last_os_error()); + r } - if fd == libc::STDOUT_FILENO || fd == libc::STDERR_FILENO { - return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); - } - CHAIN.write(fn_ptr, fd, buf, len) } -impl_facade!(WriteSyscallFacade, WriteSyscall, - write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t -); - impl_io_uring_write!(IoUringWriteSyscall, WriteSyscall, write(fd: c_int, buf: *const c_void, len: size_t) -> ssize_t ); diff --git a/hook/src/syscall/unix.rs b/hook/src/syscall/unix.rs index c39de345..14682cbc 100644 --- a/hook/src/syscall/unix.rs +++ b/hook/src/syscall/unix.rs @@ -83,6 +83,6 @@ impl_hook!(RENAMEAT2, renameat2(olddirfd: c_int, oldpath: *const c_char, newdirf // NOTE: unhook poll due to mio's poller // impl_hook!(POLL, poll(fds: *mut pollfd, nfds: nfds_t, timeout: c_int) -> c_int); -// NOTE: unhook pthread_mutex_lock/pthread_mutex_unlock due to bug +// NOTE: unhook pthread_mutex_lock/pthread_mutex_unlock due to stack overflow or bug // impl_hook!(PTHREAD_MUTEX_LOCK, pthread_mutex_lock(lock: *mut pthread_mutex_t) -> c_int); // impl_hook!(PTHREAD_MUTEX_UNLOCK, pthread_mutex_unlock(lock: *mut pthread_mutex_t) -> c_int); From 61c06af8e8b3c6b9873c0ca00de0d2f00a9ff42f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 02:57:50 +0000 Subject: [PATCH 28/31] fix: revert change_state conditional reorder and sigurg_handler Syscall check, reorder WriteSyscallFacade to match impl_facade! 
Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/6b5b9353-541a-4b6b-9636-2fdee16b53fd Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/coroutine/state.rs | 23 ++++------------------- core/src/monitor.rs | 18 ------------------ core/src/syscall/unix/write.rs | 2 +- 3 files changed, 5 insertions(+), 38 deletions(-) diff --git a/core/src/coroutine/state.rs b/core/src/coroutine/state.rs index c54f4c51..1ba91410 100644 --- a/core/src/coroutine/state.rs +++ b/core/src/coroutine/state.rs @@ -18,26 +18,11 @@ where new_state: CoroutineState, ) -> CoroutineState { let old_state = self.state.replace(new_state); - //对→Running的转换:先记录日志再通知MonitorListener。 - //on_state_changed(Running)会通过MonitorListener设置10ms的NOTIFY_NODE定时器, - //如果先通知再记录日志,在QEMU等慢平台上info!()可能耗时>10ms, - //导致定时器在日志记录期间过期→SIGURG→抢占活锁。 - //先记录日志确保NOTIFY_NODE定时器在日志I/O完成后才启动。 - // For →Running transitions: log BEFORE notifying MonitorListener. - // on_state_changed(Running) sets a 10ms NOTIFY_NODE timer via MonitorListener. - // If notified first, info!() can take >10ms on slow platforms (QEMU), - // causing the timer to expire during logging → SIGURG → preemption live-lock. - // Logging first ensures the NOTIFY_NODE timer starts after slow I/O completes. 
- if matches!(new_state, CoroutineState::Running) { - info!("{} {:?}->{:?}", self.name(), old_state, new_state); - self.on_state_changed(self, old_state, new_state); + self.on_state_changed(self, old_state, new_state); + if let CoroutineState::Error(_) = new_state { + error!("{} {:?}->{:?}", self.name(), old_state, new_state); } else { - self.on_state_changed(self, old_state, new_state); - if let CoroutineState::Error(_) = new_state { - error!("{} {:?}->{:?}", self.name(), old_state, new_state); - } else { - info!("{} {:?}->{:?}", self.name(), old_state, new_state); - } + info!("{} {:?}->{:?}", self.name(), old_state, new_state); } old_state } diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 4bdcd2d9..83b57e6f 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -78,24 +78,6 @@ impl Monitor { set.remove(Signal::SIGURG); set.thread_set_mask() .expect("Failed to remove SIGURG signal mask!"); - //不抢占处于Syscall状态的协程。 - //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 - //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, - //SIGURG可能在协程刚进入Syscall状态时到达。 - //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), - //导致死锁。 - // Skip preemption for coroutines in Syscall state. - // MonitorListener's design is to NOT send signals to Syscall-state - // coroutines. However, a race between NOTIFY_NODE removal and the - // monitor's queue iteration can cause SIGURG to arrive just after - // the coroutine entered Syscall state. If preempted here, the - // coroutine lands in the syscall map with no io_uring/epoll/timer - // registration to wake it, causing a deadlock. 
- if let Some(co) = SchedulableCoroutine::current() { - if matches!(co.state(), CoroutineState::Syscall((), _, _)) { - return; - } - } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index 4171a812..aba8c212 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -42,7 +42,6 @@ impl WriteSyscall for WriteSyscallFacade { return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); } let syscall = crate::common::constants::SyscallName::write; - crate::info!("enter syscall {}", syscall); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { let new_state = crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -51,6 +50,7 @@ impl WriteSyscall for WriteSyscallFacade { ); } } + crate::info!("enter syscall {}", syscall); let r = self.inner.write(fn_ptr, fd, buf, len); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { From 3b2f5d29631b497a62bb9e9b48cf05efb5ad2ebb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 06:07:01 +0000 Subject: [PATCH 29/31] fix: restore sigurg_handler Syscall state check to match master, preventing TOCTOU race deadlock Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/e64e3187-7a57-474c-92b2-30473acb5596 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/monitor.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 83b57e6f..4bdcd2d9 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -78,6 +78,24 @@ impl Monitor { set.remove(Signal::SIGURG); set.thread_set_mask() .expect("Failed to remove SIGURG signal mask!"); + //不抢占处于Syscall状态的协程。 + //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 + 
//但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, + //SIGURG可能在协程刚进入Syscall状态时到达。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), + //导致死锁。 + // Skip preemption for coroutines in Syscall state. + // MonitorListener's design is to NOT send signals to Syscall-state + // coroutines. However, a race between NOTIFY_NODE removal and the + // monitor's queue iteration can cause SIGURG to arrive just after + // the coroutine entered Syscall state. If preempted here, the + // coroutine lands in the syscall map with no io_uring/epoll/timer + // registration to wake it, causing a deadlock. + if let Some(co) = SchedulableCoroutine::current() { + if matches!(co.state(), CoroutineState::Syscall((), _, _)) { + return; + } + } if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } From 570c6798aa3308174eb35517c4c2edb7aea73137 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 06:26:03 +0000 Subject: [PATCH 30/31] =?UTF-8?q?fix:=20revert=20impl=5Ffacade!/WriteSysca?= =?UTF-8?q?llFacade=20reorder=20to=20match=20master=20=E2=80=94=20redundan?= =?UTF-8?q?t=20with=20WriteSyscallFacade=20stdout/stderr=20bypass=20+=20si?= =?UTF-8?q?gurg=5Fhandler=20Syscall=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent-Logs-Url: https://github.com/acl-dev/open-coroutine/sessions/f18791b5-26dd-4ea8-9b97-7f0413fa3459 Co-authored-by: loongs-zhang <38336731+loongs-zhang@users.noreply.github.com> --- core/src/syscall/unix/mod.rs | 11 +---------- core/src/syscall/unix/write.rs | 2 +- core/src/syscall/windows/mod.rs | 11 +---------- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/core/src/syscall/unix/mod.rs b/core/src/syscall/unix/mod.rs index fdb4e969..f1198be4 100644 --- a/core/src/syscall/unix/mod.rs +++ b/core/src/syscall/unix/mod.rs @@ -98,15 +98,7 @@ macro_rules! 
impl_facade { $($arg: $arg_type),* ) -> $result { let syscall = $crate::common::constants::SyscallName::$syscall; - //先转换状态再记录日志:co.syscall(Executing)会通过on_state_changed - //移除MonitorListener的NOTIFY_NODE,使monitor不再发送SIGURG。 - //如果先调用info!()再转换状态,在QEMU等慢平台上info!()可能耗时>10ms, - //导致SIGURG在协程还处于Running状态时被发送,造成抢占活锁。 - // Transition state BEFORE logging: co.syscall(Executing) triggers - // on_state_changed which removes MonitorListener's NOTIFY_NODE, - // preventing the monitor from sending SIGURG. If info!() is called - // first while still in Running state, it can take >10ms on slow - // platforms (QEMU), causing SIGURG to fire and preemption live-lock. + $crate::info!("enter syscall {}", syscall); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -115,7 +107,6 @@ macro_rules! impl_facade { ); } } - $crate::info!("enter syscall {}", syscall); let r = self.inner.$syscall(fn_ptr, $($arg, )*); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { diff --git a/core/src/syscall/unix/write.rs b/core/src/syscall/unix/write.rs index aba8c212..4171a812 100644 --- a/core/src/syscall/unix/write.rs +++ b/core/src/syscall/unix/write.rs @@ -42,6 +42,7 @@ impl WriteSyscall for WriteSyscallFacade { return RawWriteSyscall::default().write(fn_ptr, fd, buf, len); } let syscall = crate::common::constants::SyscallName::write; + crate::info!("enter syscall {}", syscall); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { let new_state = crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -50,7 +51,6 @@ impl WriteSyscall for WriteSyscallFacade { ); } } - crate::info!("enter syscall {}", syscall); let r = self.inner.write(fn_ptr, fd, buf, len); if let Some(co) = crate::scheduler::SchedulableCoroutine::current() { if co.running().is_err() { 
diff --git a/core/src/syscall/windows/mod.rs b/core/src/syscall/windows/mod.rs index ca07d632..d3f418b8 100644 --- a/core/src/syscall/windows/mod.rs +++ b/core/src/syscall/windows/mod.rs @@ -76,15 +76,7 @@ macro_rules! impl_facade { $($arg: $arg_type),* ) -> $result { let syscall = $crate::common::constants::SyscallName::$syscall; - //先转换状态再记录日志:co.syscall(Executing)会通过on_state_changed - //移除MonitorListener的NOTIFY_NODE,使monitor不再发送SIGURG。 - //如果先调用info!()再转换状态,在QEMU等慢平台上info!()可能耗时>10ms, - //导致SIGURG在协程还处于Running状态时被发送,造成抢占活锁。 - // Transition state BEFORE logging: co.syscall(Executing) triggers - // on_state_changed which removes MonitorListener's NOTIFY_NODE, - // preventing the monitor from sending SIGURG. If info!() is called - // first while still in Running state, it can take >10ms on slow - // platforms (QEMU), causing SIGURG to fire and preemption live-lock. + $crate::info!("enter syscall {}", syscall); if let Some(co) = $crate::scheduler::SchedulableCoroutine::current() { let new_state = $crate::common::constants::SyscallState::Executing; if co.syscall((), syscall, new_state).is_err() { @@ -93,7 +85,6 @@ macro_rules! impl_facade { ); } } - $crate::info!("enter syscall {}", syscall); let r = self.inner.$syscall(fn_ptr, $($arg, )*); // Save errno immediately—logging and coroutine bookkeeping // call Win32 APIs (e.g. CreateFileW) that clobber GetLastError(). 
From c2a87c9427ab0afc6961ba8227c1cfa299fe7a87 Mon Sep 17 00:00:00 2001 From: loongs-zhang <1936978077@qq.com> Date: Mon, 6 Apr 2026 14:51:22 +0800 Subject: [PATCH 31/31] less syscall --- core/src/monitor.rs | 91 +++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 66 deletions(-) diff --git a/core/src/monitor.rs b/core/src/monitor.rs index 4bdcd2d9..a324434b 100644 --- a/core/src/monitor.rs +++ b/core/src/monitor.rs @@ -74,16 +74,10 @@ impl Monitor { #[cfg(unix)] extern "C" fn sigurg_handler(_: libc::c_int) { if let Ok(mut set) = SigSet::thread_get_mask() { - //删除对SIGURG信号的屏蔽,使信号处理函数即使在处理中,也可以再次进入信号处理函数 - set.remove(Signal::SIGURG); - set.thread_set_mask() - .expect("Failed to remove SIGURG signal mask!"); - //不抢占处于Syscall状态的协程。 - //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 + //MonitorListener的设计理念是只对Running状态的协程发送信号。 //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, //SIGURG可能在协程刚进入Syscall状态时到达。 - //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), - //导致死锁。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), 导致死锁。 // Skip preemption for coroutines in Syscall state. // MonitorListener's design is to NOT send signals to Syscall-state // coroutines. However, a race between NOTIFY_NODE removal and the @@ -92,10 +86,14 @@ impl Monitor { // coroutine lands in the syscall map with no io_uring/epoll/timer // registration to wake it, causing a deadlock. if let Some(co) = SchedulableCoroutine::current() { - if matches!(co.state(), CoroutineState::Syscall((), _, _)) { + if !matches!(co.state(), CoroutineState::Running) { return; } } + //删除对SIGURG信号的屏蔽,使信号处理函数即使在处理中,也可以再次进入信号处理函数 + set.remove(Signal::SIGURG); + set.thread_set_mask() + .expect("Failed to remove SIGURG signal mask!"); if let Some(suspender) = SchedulableSuspender::current() { suspender.suspend(); } @@ -530,58 +528,27 @@ std::arch::global_asm!( "ret", ); -// Thread-local flag for two-level preemption on Windows. 
-// Level 1: SuspendThread fires, do_preempt sets this flag and returns -// without switching coroutines — the thread continues executing -// and exits any critical section (heap allocation, IO, etc.). -// If it reaches a hooked syscall, the Nio/Iocp layer will call -// Suspender::suspend_with cooperatively. -// Level 2: If the flag is still set on the next SuspendThread (~1ms later), -// the coroutine is truly CPU-bound with no syscalls — do_preempt -// forces an immediate context switch. -#[cfg(windows)] -thread_local! { - static PREEMPT_PENDING: Cell = const { Cell::new(false) }; -} - #[cfg(windows)] extern "C" fn do_preempt() { - PREEMPT_PENDING.with(|flag| { - if flag.get() { - // Flag was already set from a previous SuspendThread attempt but the - // coroutine never yielded (no hooked syscalls) — it is truly CPU-bound. - // Force immediate suspension. - flag.set(false); - //不抢占处于Syscall状态的协程。 - //MonitorListener的设计理念是不对Syscall状态的协程发送信号。 - //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, - //SIGURG可能在协程刚进入Syscall状态时到达。 - //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册), - //导致死锁。 - // Skip preemption for coroutines in Syscall state. - // MonitorListener's design is to NOT send signals to Syscall-state - // coroutines. However, a race between NOTIFY_NODE removal and the - // monitor's queue iteration can cause SIGURG to arrive just after - // the coroutine entered Syscall state. If preempted here, the - // coroutine lands in the syscall map with no io_uring/epoll/timer - // registration to wake it, causing a deadlock. - if let Some(co) = SchedulableCoroutine::current() { - if matches!(co.state(), CoroutineState::Syscall((), _, _)) { - return; - } - } - if let Some(suspender) = SchedulableSuspender::current() { - suspender.suspend(); - } - } else { - // First attempt: set the flag and return without suspending. - // preempt_asm will restore all registers and return to the original - // code. This gives the thread time to exit any critical section. 
- // If the coroutine reaches a hooked syscall, the Nio/Iocp layer - // will yield cooperatively via Suspender::suspend_with. - flag.set(true); + //MonitorListener的设计理念是只对Running状态的协程发送信号。 + //但由于NOTIFY_NODE移除和monitor线程遍历之间存在竞态条件, + //抢占请求可能在协程刚进入Syscall状态时到达。 + //如果此时抢占,协程会被放入syscall_map但无人唤醒(因为没有io_uring/epoll注册),导致死锁。 + // Only preempt coroutines that are in Running state. + // MonitorListener's design is to preempt Running coroutines only. + // However, a race between NOTIFY_NODE removal and the monitor's + // queue iteration can cause a preemption request to arrive just after + // the coroutine entered Syscall state. If preempted here, the + // coroutine lands in the syscall map with no io_uring/epoll/timer + // registration to wake it, causing a deadlock. + if let Some(co) = SchedulableCoroutine::current() { + if !matches!(co.state(), CoroutineState::Running) { + return; } - }); + } + if let Some(suspender) = SchedulableSuspender::current() { + suspender.suspend(); + } } #[repr(C)] @@ -701,14 +668,6 @@ mod tests { assert_ne!(thread_id, 0, "Thread should have reported its ID"); // Directly call preempt_thread to preempt the running coroutine. - // Two-level preemption: the first call sets a cooperative flag (the - // coroutine continues running), the second call forces suspension. - assert!( - super::Monitor::preempt_thread(thread_id), - "preempt_thread should succeed (set cooperative flag)" - ); - // Allow the first preempt_asm to complete before the second call - std::thread::sleep(Duration::from_millis(1)); assert!( super::Monitor::preempt_thread(thread_id), "preempt_thread should succeed (force suspend)"