4 changes: 4 additions & 0 deletions include/my_atomic_wrapper.h
@@ -63,6 +63,10 @@ template <typename Type> class Atomic_relaxed
std::memory_order o1= std::memory_order_relaxed,
std::memory_order o2= std::memory_order_relaxed)
{ return m.compare_exchange_strong(i1, i2, o1, o2); }
bool compare_exchange_weak(Type& i1, const Type i2,
std::memory_order o1= std::memory_order_relaxed,
std::memory_order o2= std::memory_order_relaxed)
{ return m.compare_exchange_weak(i1, i2, o1, o2); }
Type exchange(const Type i, std::memory_order o= std::memory_order_relaxed)
{ return m.exchange(i, o); }
};
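
A note on why the weak variant is worth exposing: compare_exchange_weak() may fail spuriously, which is harmless inside a retry loop and can compile to cheaper code on LL/SC architectures. A minimal sketch of the intended retry-loop idiom, using std::atomic directly and a hypothetical bump_to_at_least() that is not part of this patch:

#include <atomic>

static std::atomic<unsigned long long> target{0};

/* Monotonically raise target to at least lsn; lock-free. */
void bump_to_at_least(unsigned long long lsn)
{
  unsigned long long seen= target.load(std::memory_order_relaxed);
  while (seen < lsn &&
         !target.compare_exchange_weak(seen, lsn,
                                       std::memory_order_relaxed,
                                       std::memory_order_relaxed))
  {
    /* the failed CAS reloaded seen; re-check the condition and retry */
  }
}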
114 changes: 83 additions & 31 deletions storage/innobase/buf/buf0flu.cc
@@ -61,8 +61,10 @@ static constexpr ulint buf_flush_lsn_scan_factor = 3;
/** Average redo generation rate */
static lsn_t lsn_avg_rate = 0;

/** Target oldest_modification for the page cleaner background flushing;
writes are protected by buf_pool.flush_list_mutex */
/** Target oldest_modification for the page cleaner background flushing.
Bumped lock-free via a monotonic-max CAS loop in buf_flush_ahead();
cleared with a CAS under buf_pool.flush_list_mutex in
buf_flush_page_cleaner() and buf_flush_sync_for_checkpoint(). */
static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
/** Target oldest_modification for the page cleaner furious flushing;
writes are protected by buf_pool.flush_list_mutex */
@@ -112,6 +114,7 @@ static void buf_flush_validate_skip() noexcept

void buf_pool_t::page_cleaner_wakeup(bool for_LRU) noexcept
{
mysql_mutex_assert_owner(&flush_list_mutex);
ut_d(buf_flush_validate_skip());
if (!page_cleaner_idle())
{
@@ -154,7 +157,7 @@ void buf_pool_t::page_cleaner_wakeup(bool for_LRU) noexcept
last_activity_count == srv_get_activity_count())) ||
srv_max_buf_pool_modified_pct <= dirty_pct)
{
page_cleaner_status-= PAGE_CLEANER_IDLE;
page_cleaner_idle_flag= false;
pthread_cond_signal(&do_flush_list);
}
}
@@ -2113,33 +2116,55 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) noexcept

DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);

Atomic_relaxed<lsn_t> &limit= furious
? buf_flush_sync_lsn : buf_flush_async_lsn;
if (!furious)
{
/* Non-furious: lock-free monotonic-max via CAS-loop. */
lsn_t cur= buf_flush_async_lsn;
while (cur < lsn)
if (buf_flush_async_lsn.compare_exchange_weak(cur, lsn))
goto bumped;
return;

bumped:
/* In non-furious mode, concurrent writes to the log will remain
possible, and we are gently requesting buf_flush_page_cleaner()
to do more work to avoid a later call with furious=true.
We will only wake the buf_flush_page_cleaner() from an indefinite
my_cond_wait(), but we will not disturb the regular 1-second sleep.

To reduce contention in this gentle-request case, the buf_pool idle
flag is read outside of a buf_pool.flush_list_mutex critical
section, allowing an early return.

If the early-return decision turns out to be wrong (which is
unlikely), subsequent wake-side paths (including further
buf_flush_ahead() calls under redo-log pressure) are expected to
perform the wake eventually. */
if (!buf_pool.page_cleaner_idle())
return;

if (limit < lsn)
/* Signal under buf_pool.flush_list_mutex. */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
goto wake;
}

/* The furious path forces a log checkpoint via
log_sys.set_check_for_checkpoint(true), so the buf_flush_sync_lsn
bump, the checkpoint flag, and the cleaner wake must all be observed
together by any thread synchronising on buf_pool.flush_list_mutex. */
if (buf_flush_sync_lsn < lsn)
{
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (limit < lsn)
if (buf_flush_sync_lsn < lsn)
{
limit= lsn;
if (furious)
{
/* Request any concurrent threads to wait for this batch to complete,
in log_free_check(). */
log_sys.set_check_for_checkpoint(true);
/* Immediately wake up buf_flush_page_cleaner(), even when it
is in the middle of a 1-second my_cond_timedwait(). */
wake:
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
}
else if (buf_pool.page_cleaner_idle())
/* In non-furious mode, concurrent writes to the log will remain
possible, and we are gently requesting buf_flush_page_cleaner()
to do more work to avoid a later call with furious=true.
We will only wake the buf_flush_page_cleaner() from an indefinite
my_cond_wait(), but we will not disturb the regular 1-second sleep. */
goto wake;
buf_flush_sync_lsn= lsn;
/* Request any concurrent threads to wait for this batch to complete,
in log_free_check(). */
log_sys.set_check_for_checkpoint(true);
/* Immediately wake up buf_flush_page_cleaner(), even when it
is in the middle of a 1-second my_cond_timedwait(). */
wake:
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
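
For context on why both branches signal under buf_pool.flush_list_mutex: the waiter re-checks its predicate while holding the same mutex before blocking, so a signal sent under the mutex cannot fall into the gap between the predicate check and the wait. A stripped-down model of that handshake, with hypothetical names rather than the server's actual wait loop:

#include <pthread.h>

static pthread_mutex_t mtx= PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond= PTHREAD_COND_INITIALIZER;
static bool idle;

void waiter()  /* models the page cleaner going to sleep */
{
  pthread_mutex_lock(&mtx);
  idle= true;
  while (idle)  /* predicate is re-checked under mtx after every wake */
    pthread_cond_wait(&cond, &mtx);
  pthread_mutex_unlock(&mtx);
}

void waker()   /* models the wake: path above */
{
  pthread_mutex_lock(&mtx);
  idle= false;                 /* cf. page_cleaner_set_idle(false) */
  pthread_cond_signal(&cond);
  pthread_mutex_unlock(&mtx);
}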
@@ -2229,8 +2254,15 @@ static void buf_flush_sync_for_checkpoint(lsn_t lsn) noexcept
/* After attempting log checkpoint, check if we have reached our target. */
if (measure >= buf_flush_sync_lsn)
buf_flush_sync_lsn= 0;
else if (measure >= buf_flush_async_lsn)
buf_flush_async_lsn= 0;
else
{
/* Same coordination as in buf_flush_page_cleaner(): clear only if
unchanged since the snapshot, so a concurrent buf_flush_ahead() bump
is not lost. */
lsn_t async= buf_flush_async_lsn;
if (measure >= async)
buf_flush_async_lsn.compare_exchange_strong(async, 0);
}

log_sys.latch.wr_unlock();
/* wake up buf_flush_wait() */
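
The clear-if-unchanged idiom used here (and in buf_flush_page_cleaner()) reduces to one snapshot load plus one conditional strong CAS. A standalone sketch under assumed names, where acknowledge() is hypothetical:

#include <atomic>

static std::atomic<unsigned long long> async_lsn{0};

/* Clear the target only if it still equals the snapshot; a bump that
raced in after the snapshot makes the CAS fail, so the raised target
survives for the next iteration to pick up. */
void acknowledge(unsigned long long reached)
{
  unsigned long long snapshot= async_lsn.load(std::memory_order_relaxed);
  if (reached >= snapshot)
    async_lsn.compare_exchange_strong(snapshot, 0,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed);
}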
@@ -2582,7 +2614,14 @@ static void buf_flush_page_cleaner() noexcept
if (UNIV_UNLIKELY(soft_lsn_limit != 0))
{
if (oldest_lsn >= soft_lsn_limit)
buf_flush_async_lsn= soft_lsn_limit= 0;
{
/* Coordinate with the lock-free bumper in buf_flush_ahead():
only clear if no concurrent thread has raised the target above
the snapshot we observed. If the CAS fails, the bumped value
survives and the next iteration picks it up. */
buf_flush_async_lsn.compare_exchange_strong(soft_lsn_limit, 0);
soft_lsn_limit= 0;
}
}
else if (buf_pool.need_LRU_eviction())
{
Expand All @@ -2605,8 +2644,13 @@ static void buf_flush_page_cleaner() noexcept
oldest_lsn= buf_pool.get_oldest_modification(0);
if (!oldest_lsn)
goto fully_unemployed;
if (oldest_lsn >= buf_flush_async_lsn)
buf_flush_async_lsn= 0;
{
/* Same coordination as above: clear only if unchanged since
the snapshot, so a concurrent buf_flush_ahead() bump is not lost. */
lsn_t async= buf_flush_async_lsn;
if (oldest_lsn >= async)
buf_flush_async_lsn.compare_exchange_strong(async, 0);
}
buf_pool.page_cleaner_set_idle(false);
goto set_almost_idle;
}
Expand Down Expand Up @@ -2652,6 +2696,14 @@ static void buf_flush_page_cleaner() noexcept
}
else if (dirty_pct < srv_max_buf_pool_modified_pct)
possibly_unemployed:
/* Snapshot-based; no coordination with concurrent lock-free
buf_flush_ahead() bumpers. A bumper that observed
page_cleaner_idle()==false earlier in this iteration took its
early return; flipping idle=true here leaves its target
unacknowledged until another wake path fires. Same premise as the
bumper's early return: under sustained redo-log pressure,
subsequent buf_flush_ahead() calls observe
page_cleaner_idle()==true and take the wake path. */
if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn))
goto set_idle;

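To make the benign race described in the comment above concrete, here is a condensed model of the bumper's publish-then-check ordering (hypothetical names; the default seq_cst orderings keep the bump visible before the idle flag is read):

#include <atomic>

static std::atomic<unsigned long long> async_lsn{0};
static std::atomic<bool> cleaner_idle{false};

/* Returns true if the caller owes the cleaner a wake. The target is
published before the idle flag is read, so even a bumper that skips
the wake has already made its target visible; under sustained
pressure a later bumper sees cleaner_idle==true and takes the wake
path. */
bool bump(unsigned long long lsn)
{
  unsigned long long seen= async_lsn.load();
  do
    if (seen >= lsn)
      return false;         /* an equal or higher target is already set */
  while (!async_lsn.compare_exchange_weak(seen, lsn));
  return cleaner_idle.load();
}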
24 changes: 14 additions & 10 deletions storage/innobase/include/buf0buf.h
@@ -1532,13 +1532,18 @@ class buf_pool_t
TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size)
{ ut_ad(size); flush_list_requests+= size; }
private:
static constexpr unsigned PAGE_CLEANER_IDLE= 1;
static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
static constexpr unsigned LRU_FLUSH= 4;

/** Number of pending LRU flush * LRU_FLUSH +
PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
/** Number of pending LRU flush * LRU_FLUSH + FLUSH_LIST_ACTIVE flag.
Protected by flush_list_mutex. */
unsigned page_cleaner_status;

/** Whether the page cleaner is sleeping due to being idle.
Writes occur under flush_list_mutex; reads are lock-free (used by
the early-return in buf_flush_ahead() on a busy cleaner). */
Atomic_relaxed<bool> page_cleaner_idle_flag;

/** track server activity count for signaling idle flushing */
ulint last_activity_count;
public:
@@ -1580,19 +1585,19 @@ class buf_pool_t
page_cleaner_status-= FLUSH_LIST_ACTIVE;
}

/** @return whether the page cleaner must sleep due to being idle */
/** @return whether the page cleaner must sleep due to being idle.
Lock-free read of page_cleaner_idle_flag; safe to call without
flush_list_mutex (used by the early-return in buf_flush_ahead()). */
bool page_cleaner_idle() const noexcept
{
mysql_mutex_assert_owner(&flush_list_mutex);
return page_cleaner_status & PAGE_CLEANER_IDLE;
return page_cleaner_idle_flag;
}

/** @return whether the page cleaner may be initiating writes */
bool page_cleaner_active() const noexcept
{
mysql_mutex_assert_owner(&flush_list_mutex);
static_assert(PAGE_CLEANER_IDLE == 1, "efficiency");
return page_cleaner_status > PAGE_CLEANER_IDLE;
return page_cleaner_status != 0;
}

/** Wake up the page cleaner if needed.
@@ -1603,8 +1608,7 @@
void page_cleaner_set_idle(bool deep_sleep) noexcept
{
mysql_mutex_assert_owner(&flush_list_mutex);
page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
(PAGE_CLEANER_IDLE * deep_sleep);
page_cleaner_idle_flag= deep_sleep;
}

/** Update server last activity count */
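The resulting discipline for page_cleaner_idle_flag (writes serialised by a mutex, reads lock-free and relaxed) is a standard pattern; a minimal sketch with standard-library types instead of the server's wrappers:

#include <atomic>
#include <mutex>

class cleaner_state
{
  std::mutex m;                    /* stands in for flush_list_mutex */
  std::atomic<bool> idle{false};
public:
  void set_idle(bool v)            /* writers are serialised by m */
  {
    std::lock_guard<std::mutex> g(m);
    idle.store(v, std::memory_order_relaxed);
  }
  bool is_idle() const noexcept    /* lock-free; never sees a torn value */
  { return idle.load(std::memory_order_relaxed); }
};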