From c32193f13e612cf9d189eefa07aeaed2449a7de2 Mon Sep 17 00:00:00 2001 From: jvsena42 Date: Tue, 27 Jan 2026 11:17:00 -0300 Subject: [PATCH 1/3] fix: start and stop race condition --- .../to/bitkit/repositories/LightningRepo.kt | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt index f44ee9eec..a2bd68db0 100644 --- a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt +++ b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt @@ -25,6 +25,7 @@ import kotlinx.coroutines.flow.update import kotlinx.coroutines.isActive import kotlinx.coroutines.launch import kotlinx.coroutines.sync.Mutex +import kotlinx.coroutines.sync.withLock import kotlinx.coroutines.tasks.await import kotlinx.coroutines.withContext import kotlinx.coroutines.withTimeoutOrNull @@ -108,6 +109,7 @@ class LightningRepo @Inject constructor( private val syncMutex = Mutex() private val syncPending = AtomicBoolean(false) private val syncRetryJob = AtomicReference(null) + private val lifecycleMutex = Mutex() init { observeConnectivityForSyncRetry() @@ -269,6 +271,15 @@ class LightningRepo @Inject constructor( eventHandler?.let { _eventHandlers.add(it) } + // Wait for any in-progress stop to complete to avoid race conditions + val currentLifecycleState = _lightningState.value.nodeLifecycleState + if (currentLifecycleState == NodeLifecycleState.Stopping) { + Logger.debug("Waiting for node to finish stopping before starting...", context = TAG) + withTimeoutOrNull(30.seconds) { + _lightningState.first { it.nodeLifecycleState != NodeLifecycleState.Stopping } + } ?: Logger.warn("Timeout waiting for node to stop, proceeding anyway", context = TAG) + } + val initialLifecycleState = _lightningState.value.nodeLifecycleState if (initialLifecycleState.isRunningOrStarting()) { Logger.info("LDK node start skipped, lifecycle state: $initialLifecycleState", context = TAG) @@ -374,16 +385,36 @@ class LightningRepo @Inject constructor( } suspend fun stop(): Result = withContext(bgDispatcher) { - if (_lightningState.value.nodeLifecycleState.isStoppedOrStopping()) { - return@withContext Result.success(Unit) - } + lifecycleMutex.withLock { + if (_lightningState.value.nodeLifecycleState.isStoppedOrStopping()) { + return@withLock Result.success(Unit) + } - runCatching { - _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Stopping) } - lightningService.stop() - _lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) } - }.onFailure { - Logger.error("Node stop error", it, context = TAG) + // Wait for any in-progress start to complete + val currentState = _lightningState.value.nodeLifecycleState + if (currentState == NodeLifecycleState.Starting) { + Logger.debug("Waiting for node to finish starting before stopping...", context = TAG) + withTimeoutOrNull(30.seconds) { + _lightningState.first { it.nodeLifecycleState != NodeLifecycleState.Starting } + } ?: Logger.warn("Timeout waiting for node to start, proceeding with stop", context = TAG) + } + + runCatching { + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Stopping) } + lightningService.stop() + _lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) } + }.onFailure { + Logger.error("Node stop error", it, context = TAG) + // On failure, check actual node state and update accordingly + // If node is still running, revert to Running state to allow retry + if (lightningService.node != null && getStatus()?.isRunning == true) { + Logger.warn("Stop failed but node is still running, reverting to Running state", context = TAG) + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } + } else { + // Node appears stopped, update state + _lightningState.update { LightningState(nodeLifecycleState = NodeLifecycleState.Stopped) } + } + } } } From b2477a0c949280ee2741ecd1be7cf08d367f1417 Mon Sep 17 00:00:00 2001 From: jvsena42 Date: Tue, 27 Jan 2026 12:08:50 -0300 Subject: [PATCH 2/3] fix: get status directly from service --- app/src/main/java/to/bitkit/repositories/LightningRepo.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt index a2bd68db0..1af4e6a44 100644 --- a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt +++ b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt @@ -407,7 +407,7 @@ class LightningRepo @Inject constructor( Logger.error("Node stop error", it, context = TAG) // On failure, check actual node state and update accordingly // If node is still running, revert to Running state to allow retry - if (lightningService.node != null && getStatus()?.isRunning == true) { + if (lightningService.node != null && lightningService.status?.isRunning == true) { Logger.warn("Stop failed but node is still running, reverting to Running state", context = TAG) _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } } else { From 27d4f0cfbe206cf837243c32d306497b64cf613c Mon Sep 17 00:00:00 2001 From: jvsena42 Date: Tue, 27 Jan 2026 13:37:53 -0300 Subject: [PATCH 3/3] fix: add mutex to start() for symmetric lifecycle protection Co-Authored-By: Claude Opus 4.5 --- .../to/bitkit/repositories/LightningRepo.kt | 162 +++++++++--------- 1 file changed, 77 insertions(+), 85 deletions(-) diff --git a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt index 1af4e6a44..5ce6b4694 100644 --- a/app/src/main/java/to/bitkit/repositories/LightningRepo.kt +++ b/app/src/main/java/to/bitkit/repositories/LightningRepo.kt @@ -271,97 +271,98 @@ class LightningRepo @Inject constructor( eventHandler?.let { _eventHandlers.add(it) } - // Wait for any in-progress stop to complete to avoid race conditions - val currentLifecycleState = _lightningState.value.nodeLifecycleState - if (currentLifecycleState == NodeLifecycleState.Stopping) { - Logger.debug("Waiting for node to finish stopping before starting...", context = TAG) - withTimeoutOrNull(30.seconds) { - _lightningState.first { it.nodeLifecycleState != NodeLifecycleState.Stopping } - } ?: Logger.warn("Timeout waiting for node to stop, proceeding anyway", context = TAG) - } - - val initialLifecycleState = _lightningState.value.nodeLifecycleState - if (initialLifecycleState.isRunningOrStarting()) { - Logger.info("LDK node start skipped, lifecycle state: $initialLifecycleState", context = TAG) - return@withContext Result.success(Unit) - } + // Track retry state outside mutex to avoid deadlock (Mutex is non-reentrant) + var shouldRetryStart = false + var initialLifecycleState: NodeLifecycleState = NodeLifecycleState.Stopped + + val result = lifecycleMutex.withLock { + initialLifecycleState = _lightningState.value.nodeLifecycleState + if (initialLifecycleState.isRunningOrStarting()) { + Logger.info("LDK node start skipped, lifecycle state: $initialLifecycleState", context = TAG) + return@withLock Result.success(Unit) + } - runCatching { - _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Starting) } - - // Setup if needed - if (lightningService.node == null) { - val setupResult = setup(walletIndex, customServerUrl, customRgsServerUrl, channelMigration) - if (setupResult.isFailure) { - _lightningState.update { - it.copy( - nodeLifecycleState = NodeLifecycleState.ErrorStarting( - setupResult.exceptionOrNull() ?: NodeSetupError() + runCatching { + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Starting) } + + // Setup if needed + if (lightningService.node == null) { + val setupResult = setup(walletIndex, customServerUrl, customRgsServerUrl, channelMigration) + if (setupResult.isFailure) { + _lightningState.update { + it.copy( + nodeLifecycleState = NodeLifecycleState.ErrorStarting( + setupResult.exceptionOrNull() ?: NodeSetupError() + ) ) - ) + } + return@withLock setupResult } - return@withContext setupResult } - } - if (getStatus()?.isRunning == true) { - Logger.info("LDK node already running", context = TAG) - _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } - lightningService.startEventListener(::onEvent).onFailure { - Logger.warn("Failed to start event listener", it, context = TAG) - return@withContext Result.failure(it) + if (getStatus()?.isRunning == true) { + Logger.info("LDK node already running", context = TAG) + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } + lightningService.startEventListener(::onEvent).onFailure { + Logger.warn("Failed to start event listener", it, context = TAG) + return@withLock Result.failure(it) + } + return@withLock Result.success(Unit) } - return@withContext Result.success(Unit) - } - // Start node - lightningService.start(timeout, ::onEvent) + // Start node + lightningService.start(timeout, ::onEvent) - _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Running) } - // Initial state sync - syncState() - updateGeoBlockState() - refreshChannelCache() + // Initial state sync + syncState() + updateGeoBlockState() + refreshChannelCache() - // Post-startup tasks (non-blocking) - connectToTrustedPeers().onFailure { - Logger.error("Failed to connect to trusted peers", it, context = TAG) - } + // Post-startup tasks (non-blocking) + connectToTrustedPeers().onFailure { + Logger.error("Failed to connect to trusted peers", it, context = TAG) + } - sync().onFailure { e -> - Logger.warn("Initial sync failed, event-driven sync will retry", e, context = TAG) - } - scope.launch { registerForNotifications() } - Unit - }.onFailure { e -> - val currentLifecycleState = _lightningState.value.nodeLifecycleState - if (currentLifecycleState.isRunning()) { - Logger.warn("Start error occurred but node is $currentLifecycleState, skipping retry", e, context = TAG) - return@withContext Result.success(Unit) - } + sync().onFailure { e -> + Logger.warn("Initial sync failed, event-driven sync will retry", e, context = TAG) + } + scope.launch { registerForNotifications() } + Result.success(Unit) + }.getOrElse { e -> + val currentState = _lightningState.value.nodeLifecycleState + if (currentState.isRunning()) { + Logger.warn("Start error but node is $currentState, skipping retry", e, context = TAG) + return@withLock Result.success(Unit) + } - if (shouldRetry) { - val retryDelay = 2.seconds - Logger.warn("Start error, retrying after $retryDelay...", e, context = TAG) - _lightningState.update { it.copy(nodeLifecycleState = initialLifecycleState) } - - delay(retryDelay) - return@withContext start( - walletIndex = walletIndex, - timeout = timeout, - shouldRetry = false, - customServerUrl = customServerUrl, - customRgsServerUrl = customRgsServerUrl, - channelMigration = channelMigration, - ) - } else { - _lightningState.update { - it.copy(nodeLifecycleState = NodeLifecycleState.ErrorStarting(e)) + if (shouldRetry) { + Logger.warn("Start error, will retry...", e, context = TAG) + _lightningState.update { it.copy(nodeLifecycleState = initialLifecycleState) } + shouldRetryStart = true + Result.failure(e) + } else { + _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.ErrorStarting(e)) } + Result.failure(e) } - return@withContext Result.failure(e) } } + + // Retry OUTSIDE the mutex to avoid deadlock (Kotlin Mutex is non-reentrant) + if (shouldRetryStart) { + delay(2.seconds) + return@withContext start( + walletIndex = walletIndex, + timeout = timeout, + shouldRetry = false, + customServerUrl = customServerUrl, + customRgsServerUrl = customRgsServerUrl, + channelMigration = channelMigration, + ) + } + + result } private suspend fun onEvent(event: Event) { @@ -390,15 +391,6 @@ class LightningRepo @Inject constructor( return@withLock Result.success(Unit) } - // Wait for any in-progress start to complete - val currentState = _lightningState.value.nodeLifecycleState - if (currentState == NodeLifecycleState.Starting) { - Logger.debug("Waiting for node to finish starting before stopping...", context = TAG) - withTimeoutOrNull(30.seconds) { - _lightningState.first { it.nodeLifecycleState != NodeLifecycleState.Starting } - } ?: Logger.warn("Timeout waiting for node to start, proceeding with stop", context = TAG) - } - runCatching { _lightningState.update { it.copy(nodeLifecycleState = NodeLifecycleState.Stopping) } lightningService.stop()