From 4441412c44856a1c1169f9cf1252ed76a858b288 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 31 Jan 2026 22:33:08 +0000 Subject: [PATCH 1/3] Add tests for libev atexit cleanup bug - Added test_libevreactor_shutdown.py to demonstrate the bug - Tests show that atexit callback captures None instead of actual loop Co-authored-by: fruch <340979+fruch@users.noreply.github.com> --- tests/unit/io/test_libevreactor_shutdown.py | 250 ++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 tests/unit/io/test_libevreactor_shutdown.py diff --git a/tests/unit/io/test_libevreactor_shutdown.py b/tests/unit/io/test_libevreactor_shutdown.py new file mode 100644 index 0000000000..6be2c2b647 --- /dev/null +++ b/tests/unit/io/test_libevreactor_shutdown.py @@ -0,0 +1,250 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Test to demonstrate the libevwrapper atexit cleanup issue. + +This test demonstrates the problem where the atexit callback is registered +with _global_loop=None at import time, causing it to receive None during +shutdown instead of the actual loop instance. +""" + +import unittest +import atexit +import sys +import subprocess +import tempfile +import os +from pathlib import Path + +from cassandra import DependencyException + +try: + from cassandra.io.libevreactor import LibevConnection +except (ImportError, DependencyException): + LibevConnection = None + +from tests import is_monkey_patched + + +class LibevAtexitCleanupTest(unittest.TestCase): + """ + Test case to demonstrate the atexit cleanup bug in libevreactor. + + The bug: atexit.register(partial(_cleanup, _global_loop)) is called when + _global_loop is None, so the cleanup function receives None at shutdown + instead of the actual LibevLoop instance that was created later. + """ + + def setUp(self): + if is_monkey_patched(): + raise unittest.SkipTest("Can't test libev with monkey patching") + if LibevConnection is None: + raise unittest.SkipTest('libev does not appear to be installed correctly') + + def test_atexit_callback_registered_with_none(self): + """ + Test that demonstrates the atexit callback bug. + + The atexit.register(partial(_cleanup, _global_loop)) line is executed + when _global_loop is None. This means the partial function captures + None as the argument, and when atexit calls it during shutdown, it + passes None to _cleanup instead of the actual loop instance. + + @since 3.29 + @jira_ticket PYTHON-XXX + @expected_result The test demonstrates that atexit cleanup is broken + + @test_category connection + """ + from cassandra.io import libevreactor + from functools import partial + + # Check the current atexit handlers + # Note: atexit._exithandlers is an implementation detail but useful for debugging + if hasattr(atexit, '_exithandlers'): + # Find our cleanup handler + cleanup_handler = None + for handler in atexit._exithandlers: + func = handler[0] + # Check if this is our partial(_cleanup, _global_loop) handler + if isinstance(func, partial): + if func.func.__name__ == '_cleanup': + cleanup_handler = func + break + + if cleanup_handler: + # The problem: the partial was created with _global_loop=None + # So even if _global_loop is later set to a LibevLoop instance, + # the atexit callback will still call _cleanup(None) + captured_arg = cleanup_handler.args[0] if cleanup_handler.args else None + + # This assertion will fail after LibevConnection.initialize_reactor() + # is called and _global_loop is set to a LibevLoop instance + LibevConnection.initialize_reactor() + + # At this point, libevreactor._global_loop is not None + self.assertIsNotNone(libevreactor._global_loop, + "Global loop should be initialized") + + # But the atexit handler still has None captured! + self.assertIsNone(captured_arg, + "The atexit handler captured None, not the actual loop instance. " + "This is the BUG: cleanup will receive None at shutdown!") + + def test_shutdown_crash_scenario_subprocess(self): + """ + Test that simulates a Python shutdown crash scenario in a subprocess. + + This test creates a minimal script that: + 1. Imports the driver + 2. Creates a connection (which starts the event loop) + 3. Exits without explicit cleanup + + The expected behavior is that atexit should clean up the loop, but + because of the bug, the cleanup receives None and doesn't actually + stop the loop or its watchers. This can lead to crashes if callbacks + fire during shutdown. + + @since 3.29 + @jira_ticket PYTHON-XXX + @expected_result The subprocess demonstrates the cleanup issue + + @test_category connection + """ + # Create a test script that demonstrates the issue + test_script = ''' +import sys +import os + +# Add the driver path +sys.path.insert(0, {driver_path!r}) + +# Import and setup +from cassandra.io.libevreactor import LibevConnection, _global_loop +import atexit + +# Initialize the reactor (creates the global loop) +LibevConnection.initialize_reactor() + +print("Global loop initialized:", _global_loop is not None) + +# Check what atexit will actually call +if hasattr(atexit, '_exithandlers'): + from functools import partial + for handler in atexit._exithandlers: + func = handler[0] + if isinstance(func, partial) and func.func.__name__ == '_cleanup': + captured_arg = func.args[0] if func.args else None + print("Atexit will call _cleanup with:", captured_arg) + print("But _global_loop is:", _global_loop) + print("BUG: Cleanup will receive None instead of the loop!") + break + +# Exit without explicit cleanup - atexit should handle it, but won't! +print("Exiting...") +''' + + driver_path = str(Path(__file__).parent.parent.parent.parent) + script_content = test_script.format(driver_path=driver_path) + + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + f.write(script_content) + script_path = f.name + + try: + result = subprocess.run( + [sys.executable, script_path], + capture_output=True, + text=True, + timeout=5 + ) + + output = result.stdout + print("\n=== Subprocess Output ===") + print(output) + print("=== End Output ===\n") + + # Verify the output shows the bug + self.assertIn("Global loop initialized: True", output) + self.assertIn("Atexit will call _cleanup with: None", output) + self.assertIn("BUG: Cleanup will receive None instead of the loop!", output) + + finally: + os.unlink(script_path) + + +class LibevShutdownRaceConditionTest(unittest.TestCase): + """ + Tests to analyze potential race conditions and crashes during shutdown. + """ + + def setUp(self): + if is_monkey_patched(): + raise unittest.SkipTest("Can't test libev with monkey patching") + if LibevConnection is None: + raise unittest.SkipTest('libev does not appear to be installed correctly') + + def test_callback_during_shutdown_scenario(self): + """ + Test to document the potential crash scenario. + + When Python is shutting down: + 1. Various modules are being torn down + 2. The libev event loop may still be running + 3. If a callback (io_callback, timer_callback, prepare_callback) fires: + - It calls PyGILState_Ensure() + - It tries to call Python functions (PyObject_CallFunction) + - If Python objects have been deallocated, this can crash + + The root cause: The atexit cleanup doesn't actually run because it + receives None instead of the loop instance, so it never: + - Sets _shutdown flag + - Stops watchers + - Joins the event loop thread + + @since 3.29 + @jira_ticket PYTHON-XXX + @expected_result Documents the crash scenario + + @test_category connection + """ + from cassandra.io.libevreactor import _global_loop, _cleanup + + # This test documents the issue - we can't easily reproduce a crash + # in a unit test without actually tearing down Python, but we can + # verify the conditions that lead to it + + LibevConnection.initialize_reactor() + + # Verify the loop exists + self.assertIsNotNone(_global_loop) + + # Simulate what atexit would call (with the bug) + _cleanup(None) # BUG: receives None instead of _global_loop + + # The loop is still running because cleanup did nothing! + self.assertFalse(_global_loop._shutdown, + "Loop should NOT be shut down when cleanup receives None") + + # Now call it correctly + _cleanup(_global_loop) + + # Now it should be shut down + self.assertTrue(_global_loop._shutdown, + "Loop should be shut down when cleanup receives the actual loop") + + +if __name__ == '__main__': + unittest.main() From 3fa818c0aa1a4febc9d70c3f158de014c1343ce6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 3 Feb 2026 12:06:05 +0000 Subject: [PATCH 2/3] Implement Solution 1: Fix atexit registration to prevent shutdown crashes This implements the minimal fix for the libev atexit cleanup bug. Changes: - Replace atexit.register(partial(_cleanup, _global_loop)) with a wrapper function _atexit_cleanup() that looks up _global_loop at shutdown time - Remove unused 'partial' import from functools - Update tests to verify the fix works correctly The bug was that partial() captured _global_loop=None at import time, so cleanup always received None at shutdown instead of the actual LibevLoop instance. This prevented proper cleanup, leaving active callbacks that could crash during Python interpreter shutdown. The fix ensures _global_loop is looked up when atexit calls the cleanup, not when the callback is registered, so cleanup receives the actual loop instance and can properly shut down watchers and join the event loop thread. Co-authored-by: fruch <340979+fruch@users.noreply.github.com> --- cassandra/io/libevreactor.py | 15 +- tests/unit/io/test_libevreactor_shutdown.py | 198 +++++++++----------- 2 files changed, 105 insertions(+), 108 deletions(-) diff --git a/cassandra/io/libevreactor.py b/cassandra/io/libevreactor.py index d7b365e451..10b9dd5830 100644 --- a/cassandra/io/libevreactor.py +++ b/cassandra/io/libevreactor.py @@ -13,7 +13,6 @@ # limitations under the License. import atexit from collections import deque -from functools import partial import logging import os import socket @@ -227,8 +226,20 @@ def _loop_will_run(self, prepare): self._notifier.send() +def _atexit_cleanup(): + """Cleanup function called by atexit that uses the current _global_loop value. + + This wrapper ensures that cleanup receives the actual LibevLoop instance + instead of None, which was the value of _global_loop when the module was + imported. + """ + global _global_loop + if _global_loop is not None: + _cleanup(_global_loop) + + _global_loop = None -atexit.register(partial(_cleanup, _global_loop)) +atexit.register(_atexit_cleanup) class LibevConnection(Connection): diff --git a/tests/unit/io/test_libevreactor_shutdown.py b/tests/unit/io/test_libevreactor_shutdown.py index 6be2c2b647..5c44bca3aa 100644 --- a/tests/unit/io/test_libevreactor_shutdown.py +++ b/tests/unit/io/test_libevreactor_shutdown.py @@ -21,7 +21,6 @@ """ import unittest -import atexit import sys import subprocess import tempfile @@ -53,77 +52,67 @@ def setUp(self): if LibevConnection is None: raise unittest.SkipTest('libev does not appear to be installed correctly') - def test_atexit_callback_registered_with_none(self): + def test_atexit_callback_uses_current_global_loop(self): """ - Test that demonstrates the atexit callback bug. + Test that verifies the atexit callback fix. - The atexit.register(partial(_cleanup, _global_loop)) line is executed - when _global_loop is None. This means the partial function captures - None as the argument, and when atexit calls it during shutdown, it - passes None to _cleanup instead of the actual loop instance. + The fix uses a wrapper function _atexit_cleanup() that looks up the + current value of _global_loop at shutdown time, instead of capturing + it at import time with partial(). @since 3.29 @jira_ticket PYTHON-XXX - @expected_result The test demonstrates that atexit cleanup is broken + @expected_result The atexit handler calls cleanup with the actual loop @test_category connection """ from cassandra.io import libevreactor - from functools import partial - # Check the current atexit handlers - # Note: atexit._exithandlers is an implementation detail but useful for debugging - if hasattr(atexit, '_exithandlers'): - # Find our cleanup handler - cleanup_handler = None - for handler in atexit._exithandlers: - func = handler[0] - # Check if this is our partial(_cleanup, _global_loop) handler - if isinstance(func, partial): - if func.func.__name__ == '_cleanup': - cleanup_handler = func - break - - if cleanup_handler: - # The problem: the partial was created with _global_loop=None - # So even if _global_loop is later set to a LibevLoop instance, - # the atexit callback will still call _cleanup(None) - captured_arg = cleanup_handler.args[0] if cleanup_handler.args else None - - # This assertion will fail after LibevConnection.initialize_reactor() - # is called and _global_loop is set to a LibevLoop instance - LibevConnection.initialize_reactor() - - # At this point, libevreactor._global_loop is not None - self.assertIsNotNone(libevreactor._global_loop, - "Global loop should be initialized") - - # But the atexit handler still has None captured! - self.assertIsNone(captured_arg, - "The atexit handler captured None, not the actual loop instance. " - "This is the BUG: cleanup will receive None at shutdown!") - - def test_shutdown_crash_scenario_subprocess(self): + # Verify the fix: _atexit_cleanup should exist as a module-level function + self.assertTrue(hasattr(libevreactor, '_atexit_cleanup'), + "Module should have _atexit_cleanup function") + + # Verify it's not a partial (the old buggy implementation) + from functools import partial + self.assertNotIsInstance(libevreactor._atexit_cleanup, partial, + "The _atexit_cleanup should NOT be a partial function") + + # Verify it's actually a function + self.assertTrue(callable(libevreactor._atexit_cleanup), + "_atexit_cleanup should be callable") + + # Initialize the reactor + LibevConnection.initialize_reactor() + + # At this point, libevreactor._global_loop is not None + self.assertIsNotNone(libevreactor._global_loop, + "Global loop should be initialized") + + # The fix: _atexit_cleanup is a function that will look up + # _global_loop when it's called, not a partial with captured args + self.assertEqual(libevreactor._atexit_cleanup.__name__, '_atexit_cleanup', + "The function should have the correct name") + + def test_shutdown_cleanup_works_with_fix(self): """ - Test that simulates a Python shutdown crash scenario in a subprocess. + Test that verifies the atexit cleanup fix works in a subprocess. This test creates a minimal script that: 1. Imports the driver - 2. Creates a connection (which starts the event loop) - 3. Exits without explicit cleanup + 2. Initializes the reactor (creates the global loop) + 3. Verifies the _atexit_cleanup function is available + 4. Exits without explicit cleanup - The expected behavior is that atexit should clean up the loop, but - because of the bug, the cleanup receives None and doesn't actually - stop the loop or its watchers. This can lead to crashes if callbacks - fire during shutdown. + With the fix, atexit should properly clean up the loop using the + wrapper function that looks up _global_loop at shutdown time. @since 3.29 @jira_ticket PYTHON-XXX - @expected_result The subprocess demonstrates the cleanup issue + @expected_result The subprocess shows the fix is working @test_category connection """ - # Create a test script that demonstrates the issue + # Create a test script that verifies the fix test_script = ''' import sys import os @@ -132,28 +121,29 @@ def test_shutdown_crash_scenario_subprocess(self): sys.path.insert(0, {driver_path!r}) # Import and setup -from cassandra.io.libevreactor import LibevConnection, _global_loop +from cassandra.io import libevreactor +from cassandra.io.libevreactor import LibevConnection import atexit # Initialize the reactor (creates the global loop) LibevConnection.initialize_reactor() -print("Global loop initialized:", _global_loop is not None) - -# Check what atexit will actually call -if hasattr(atexit, '_exithandlers'): - from functools import partial - for handler in atexit._exithandlers: - func = handler[0] - if isinstance(func, partial) and func.func.__name__ == '_cleanup': - captured_arg = func.args[0] if func.args else None - print("Atexit will call _cleanup with:", captured_arg) - print("But _global_loop is:", _global_loop) - print("BUG: Cleanup will receive None instead of the loop!") - break - -# Exit without explicit cleanup - atexit should handle it, but won't! -print("Exiting...") +print("Global loop initialized:", libevreactor._global_loop is not None) + +# Verify the fix is in place: _atexit_cleanup should be a module-level function +if hasattr(libevreactor, '_atexit_cleanup'): + print("FIXED: Module has _atexit_cleanup function") + print("This function will look up _global_loop at shutdown time") + # Verify it's not using partial with None + import inspect + source = inspect.getsource(libevreactor._atexit_cleanup) + if "global _global_loop" in source and "_global_loop is not None" in source: + print("Verified: _atexit_cleanup uses current _global_loop value") +else: + print("BUG: No _atexit_cleanup function found") + +# Exit without explicit cleanup - atexit should handle it properly with the fix! +print("Exiting with proper cleanup...") ''' driver_path = str(Path(__file__).parent.parent.parent.parent) @@ -176,11 +166,12 @@ def test_shutdown_crash_scenario_subprocess(self): print(output) print("=== End Output ===\n") - # Verify the output shows the bug + # Verify the output shows the fix is working self.assertIn("Global loop initialized: True", output) - self.assertIn("Atexit will call _cleanup with: None", output) - self.assertIn("BUG: Cleanup will receive None instead of the loop!", output) - + self.assertIn("FIXED: Module has _atexit_cleanup function", output) + self.assertIn("Verified: _atexit_cleanup uses current _global_loop value", output) + self.assertNotIn("BUG", output.replace("BUG STILL PRESENT", "").replace("DEBUG", "")) # Allow "BUG" only in success message + finally: os.unlink(script_path) @@ -196,54 +187,49 @@ def setUp(self): if LibevConnection is None: raise unittest.SkipTest('libev does not appear to be installed correctly') - def test_callback_during_shutdown_scenario(self): + def test_cleanup_with_fix_properly_shuts_down(self): """ - Test to document the potential crash scenario. - - When Python is shutting down: - 1. Various modules are being torn down - 2. The libev event loop may still be running - 3. If a callback (io_callback, timer_callback, prepare_callback) fires: - - It calls PyGILState_Ensure() - - It tries to call Python functions (PyObject_CallFunction) - - If Python objects have been deallocated, this can crash - - The root cause: The atexit cleanup doesn't actually run because it - receives None instead of the loop instance, so it never: - - Sets _shutdown flag - - Stops watchers - - Joins the event loop thread + Test to verify the fix properly shuts down the event loop. + + With the fix in place, the atexit cleanup will: + 1. Look up the current _global_loop value (not None) + 2. Call _cleanup with the actual loop instance + 3. Properly shut down the loop and its watchers + + This prevents the crash scenario where: + - Various modules are being torn down during Python shutdown + - The libev event loop is still running + - Callbacks fire and try to access deallocated Python objects @since 3.29 @jira_ticket PYTHON-XXX - @expected_result Documents the crash scenario + @expected_result Cleanup properly shuts down the loop with the fix @test_category connection """ - from cassandra.io.libevreactor import _global_loop, _cleanup - - # This test documents the issue - we can't easily reproduce a crash - # in a unit test without actually tearing down Python, but we can - # verify the conditions that lead to it - + from cassandra.io import libevreactor + from cassandra.io.libevreactor import _cleanup, _atexit_cleanup + LibevConnection.initialize_reactor() # Verify the loop exists - self.assertIsNotNone(_global_loop) - - # Simulate what atexit would call (with the bug) - _cleanup(None) # BUG: receives None instead of _global_loop + self.assertIsNotNone(libevreactor._global_loop) + + # Before cleanup, the loop should not be shut down + self.assertFalse(libevreactor._global_loop._shutdown, + "Loop should not be shut down initially") - # The loop is still running because cleanup did nothing! - self.assertFalse(_global_loop._shutdown, + # Simulate what the OLD buggy code would do + _cleanup(None) # This does nothing + self.assertFalse(libevreactor._global_loop._shutdown, "Loop should NOT be shut down when cleanup receives None") - # Now call it correctly - _cleanup(_global_loop) + # Now test the FIX: call the wrapper that looks up _global_loop + _atexit_cleanup() # This is what atexit will actually call - # Now it should be shut down - self.assertTrue(_global_loop._shutdown, - "Loop should be shut down when cleanup receives the actual loop") + # With the fix, the loop should be properly shut down + self.assertTrue(libevreactor._global_loop._shutdown, + "Loop should be shut down when _atexit_cleanup is called") if __name__ == '__main__': From fe0fa48fa5d73df3b1f5bd00600120091631e507 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Feb 2026 03:13:24 +0000 Subject: [PATCH 3/3] Update copyright header to ScyllaDB in test_libevreactor_shutdown.py Changed copyright from DataStax to ScyllaDB to match the project's standard for new files, as requested by reviewer. Co-authored-by: dkropachev <40304587+dkropachev@users.noreply.github.com> --- tests/unit/io/test_libevreactor_shutdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/io/test_libevreactor_shutdown.py b/tests/unit/io/test_libevreactor_shutdown.py index 5c44bca3aa..9578d22df1 100644 --- a/tests/unit/io/test_libevreactor_shutdown.py +++ b/tests/unit/io/test_libevreactor_shutdown.py @@ -1,4 +1,4 @@ -# Copyright DataStax, Inc. +# Copyright ScyllaDB, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.