From 79b97411af586acdb36dfc4a8003a48779901547 Mon Sep 17 00:00:00 2001
From: yanfeng
Date: Mon, 9 Feb 2026 13:58:10 +0800
Subject: [PATCH 1/5] feat(auto_cl): add error rate threshold for punishment
 attenuation

Add new GFlag `auto_cl_error_rate_punish_threshold` to enable
error-rate-based punishment attenuation in AutoConcurrencyLimiter.

Problem: Low error rates (e.g., 1.3% sporadic timeouts) cause
disproportionate avg_latency inflation (+31%), leading the limiter to
mistakenly shrink max_concurrency and trigger ELIMIT rejections.

Solution: Inspired by Alibaba Sentinel's threshold-based approach:
- threshold=0 (default): Original behavior preserved (backward compat)
- threshold>0 (e.g., 0.1): Error rates below threshold produce zero
  punishment; above it, punishment scales linearly from 0 to full

Example: With threshold=0.1, a 5% error rate produces no punishment,
while a 50% error rate produces 44% of the original punishment.

Co-Authored-By: Claude Opus 4.5
---
 src/brpc/policy/auto_concurrency_limiter.cpp  |  30 ++-
 ...brpc_auto_concurrency_limiter_unittest.cpp | 208 ++++++++++++++++++
 2 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 test/brpc_auto_concurrency_limiter_unittest.cpp

diff --git a/src/brpc/policy/auto_concurrency_limiter.cpp b/src/brpc/policy/auto_concurrency_limiter.cpp
index dd5a02ec99..220d8a1693 100644
--- a/src/brpc/policy/auto_concurrency_limiter.cpp
+++ b/src/brpc/policy/auto_concurrency_limiter.cpp
@@ -77,6 +77,14 @@ DEFINE_int32(auto_cl_latency_fluctuation_correction_factor, 1,
             "the value, the higher the tolerance for the fluctuation of the "
             "latency. If the value is too large, the latency will be higher "
             "when the server is overloaded.");
+DEFINE_double(auto_cl_error_rate_punish_threshold, 0,
+             "Threshold for error-rate-based punishment attenuation. "
+             "0 (default): no effect, original punishment logic is used. "
+             "> 0 (e.g. 0.1): error rates below this threshold produce zero "
+             "punishment; above it the punishment scales linearly from 0 to "
+             "full strength. Only effective when auto_cl_enable_error_punish "
+             "is true. Example: 0.1 means error rates below 10%% are not "
+             "punished.");
 
 AutoConcurrencyLimiter::AutoConcurrencyLimiter()
     : _max_concurrency(FLAGS_auto_cl_initial_max_concurrency)
@@ -236,7 +244,27 @@ void AutoConcurrencyLimiter::AdjustMaxConcurrency(int next_max_concurrency) {
 void AutoConcurrencyLimiter::UpdateMaxConcurrency(int64_t sampling_time_us) {
     int32_t total_succ_req = _total_succ_req.load(butil::memory_order_relaxed);
     double failed_punish = _sw.total_failed_us * FLAGS_auto_cl_fail_punish_ratio;
-    int64_t avg_latency =
+
+    // Threshold-based attenuation: when auto_cl_error_rate_punish_threshold > 0,
+    // attenuate punishment based on error rate. Inspired by Sentinel's threshold-
+    // based circuit breaker: low error rates should not inflate avg_latency.
+    // Above threshold, punishment scales linearly from 0 to full strength.
+    // When threshold is 0 (default), this block is skipped entirely.
+    if (FLAGS_auto_cl_error_rate_punish_threshold > 0 && _sw.failed_count > 0) {
+        double threshold = FLAGS_auto_cl_error_rate_punish_threshold;
+        double error_rate = static_cast<double>(_sw.failed_count) /
+                            (_sw.succ_count + _sw.failed_count);
+        if (error_rate <= threshold) {
+            // Error rate within dead zone, cancel punishment.
+            failed_punish = 0;
+        } else {
+            // Linear ramp: 0 at threshold, 1.0 at 100% error rate.
+            double punish_factor = (error_rate - threshold) / (1.0 - threshold);
+            failed_punish *= punish_factor;
+        }
+    }
+
+    int64_t avg_latency =
         std::ceil((failed_punish + _sw.total_succ_us) / _sw.succ_count);
     double qps = 1000000.0 * total_succ_req / (sampling_time_us - _sw.start_time_us);
     UpdateMinLatency(avg_latency);
diff --git a/test/brpc_auto_concurrency_limiter_unittest.cpp b/test/brpc_auto_concurrency_limiter_unittest.cpp
new file mode 100644
index 0000000000..7bece930c9
--- /dev/null
+++ b/test/brpc_auto_concurrency_limiter_unittest.cpp
@@ -0,0 +1,208 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "brpc/policy/auto_concurrency_limiter.h"
+#include "butil/time.h"
+#include "bthread/bthread.h"
+#include <gtest/gtest.h>
+
+namespace brpc {
+namespace policy {
+
+DECLARE_int32(auto_cl_sample_window_size_ms);
+DECLARE_int32(auto_cl_min_sample_count);
+DECLARE_int32(auto_cl_max_sample_count);
+DECLARE_bool(auto_cl_enable_error_punish);
+DECLARE_double(auto_cl_fail_punish_ratio);
+DECLARE_double(auto_cl_error_rate_punish_threshold);
+
+} // namespace policy
+} // namespace brpc
+
+class AutoConcurrencyLimiterTest : public ::testing::Test {
+protected:
+    void SetUp() override {
+        // Save original values
+        orig_sample_window_size_ms_ = brpc::policy::FLAGS_auto_cl_sample_window_size_ms;
+        orig_min_sample_count_ = brpc::policy::FLAGS_auto_cl_min_sample_count;
+        orig_max_sample_count_ = brpc::policy::FLAGS_auto_cl_max_sample_count;
+        orig_enable_error_punish_ = brpc::policy::FLAGS_auto_cl_enable_error_punish;
+        orig_fail_punish_ratio_ = brpc::policy::FLAGS_auto_cl_fail_punish_ratio;
+        orig_error_rate_threshold_ = brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold;
+
+        // Set test-friendly values
+        brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 1000;
+        brpc::policy::FLAGS_auto_cl_min_sample_count = 5;
+        brpc::policy::FLAGS_auto_cl_max_sample_count = 200;
+        brpc::policy::FLAGS_auto_cl_enable_error_punish = true;
+        brpc::policy::FLAGS_auto_cl_fail_punish_ratio = 1.0;
+    }
+
+    void TearDown() override {
+        // Restore original values
+        brpc::policy::FLAGS_auto_cl_sample_window_size_ms = orig_sample_window_size_ms_;
+        brpc::policy::FLAGS_auto_cl_min_sample_count = orig_min_sample_count_;
+        brpc::policy::FLAGS_auto_cl_max_sample_count = orig_max_sample_count_;
+        brpc::policy::FLAGS_auto_cl_enable_error_punish = orig_enable_error_punish_;
+        brpc::policy::FLAGS_auto_cl_fail_punish_ratio = orig_fail_punish_ratio_;
+        brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = orig_error_rate_threshold_;
+    }
+
+private:
+    int32_t orig_sample_window_size_ms_;
+    int32_t orig_min_sample_count_;
+    int32_t orig_max_sample_count_;
+    bool orig_enable_error_punish_;
+    double orig_fail_punish_ratio_;
+    double orig_error_rate_threshold_;
+};
+
+// Helper function to
add samples and trigger window completion +void AddSamplesAndTriggerWindow(brpc::policy::AutoConcurrencyLimiter& limiter, + int succ_count, int64_t succ_latency, + int fail_count, int64_t fail_latency) { + int64_t now = butil::gettimeofday_us(); + + // Add successful samples + for (int i = 0; i < succ_count; ++i) { + limiter.AddSample(0, succ_latency, now); + } + // Add failed samples + for (int i = 0; i < fail_count; ++i) { + limiter.AddSample(1, fail_latency, now); + } + + // Wait for window to expire and trigger update + bthread_usleep(brpc::policy::FLAGS_auto_cl_sample_window_size_ms * 1000 + 1000); + + // Add one more sample to trigger window submission + limiter.AddSample(0, succ_latency, butil::gettimeofday_us()); +} + +// Test: When threshold is 0 (default), behavior is unchanged - punishment is applied +TEST_F(AutoConcurrencyLimiterTest, ThresholdZeroPreservesOriginalBehavior) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0; + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; // Short window for testing + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); + + // With threshold=0, failed_punish should NOT be attenuated + // avg_latency = (10*1000 + 90*100) / 90 = (10000 + 9000) / 90 = 211us + // This is significantly inflated from the actual success latency of 100us + // _min_latency_us should reflect this inflation + ASSERT_GT(limiter._min_latency_us, 150); // Should be inflated +} + +// Test: When error rate is below threshold, punishment is zero +TEST_F(AutoConcurrencyLimiterTest, BelowThresholdZeroPunishment) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.2; // 20% threshold + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); + + // With 10% error rate < 20% threshold, punishment should be zero + // avg_latency should be close to actual success latency of 100us + ASSERT_LT(limiter._min_latency_us, 150); // Should NOT be inflated + ASSERT_GT(limiter._min_latency_us, 50); // Should be valid (around 100us) +} + +// Test: When error rate is above threshold, punishment scales linearly +TEST_F(AutoConcurrencyLimiterTest, AboveThresholdLinearScaling) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; // 10% threshold + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 50, 100, 50, 1000); + + // With 50% error rate > 10% threshold: + // punish_factor = (0.5 - 0.1) / (1.0 - 0.1) = 0.4 / 0.9 = 0.444 + // failed_punish = 50 * 1000 * 1.0 * 0.444 = 22222us + // avg_latency = (22222 + 50*100) / 50 = (22222 + 5000) / 50 = 544us + // This should be inflated, but less than threshold=0 case + ASSERT_GT(limiter._min_latency_us, 200); // Should be somewhat inflated +} + +// Test: Edge case - error rate exactly at threshold +TEST_F(AutoConcurrencyLimiterTest, ExactlyAtThresholdZeroPunishment) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; // 10% threshold + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); + + // At exactly threshold, punishment should be zero (boundary case) + // avg_latency should be close to actual success latency of 100us + ASSERT_LT(limiter._min_latency_us, 150); +} + +// Test: No failed requests - threshold has no effect 
+TEST_F(AutoConcurrencyLimiterTest, NoFailedRequestsThresholdNoEffect) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 100, 100, 0, 0); + + // No failed requests, so threshold logic shouldn't trigger + ASSERT_GT(limiter._min_latency_us, 0); // Should have valid latency + ASSERT_LT(limiter._min_latency_us, 150); // Should be close to 100us +} + +// Test: Compare punishment at different thresholds for same error rate +TEST_F(AutoConcurrencyLimiterTest, DifferentThresholdsDifferentPunishment) { + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + // Test with threshold = 0 (original behavior) + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0; + brpc::policy::AutoConcurrencyLimiter limiter1; + AddSamplesAndTriggerWindow(limiter1, 95, 100, 5, 1000); // 5% error rate + int64_t latency_threshold_0 = limiter1._min_latency_us; + + // Test with threshold = 0.1 (5% < 10%, in dead zone) + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; + brpc::policy::AutoConcurrencyLimiter limiter2; + AddSamplesAndTriggerWindow(limiter2, 95, 100, 5, 1000); // 5% error rate + int64_t latency_threshold_10 = limiter2._min_latency_us; + + // With threshold=0, latency should be inflated + // With threshold=0.1 and 5% error rate (below threshold), latency should not be inflated + ASSERT_GT(latency_threshold_0, latency_threshold_10); +} + +// Test: Verify linear scaling formula +TEST_F(AutoConcurrencyLimiterTest, LinearScalingFormula) { + // At 90% error rate, punishment factor should be 0.889 + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; + + brpc::policy::AutoConcurrencyLimiter limiter; + + AddSamplesAndTriggerWindow(limiter, 10, 100, 90, 1000); + + // With 90% error rate > 10% threshold: + // punish_factor = (0.9 - 0.1) / (1.0 - 0.1) = 0.8 / 0.9 = 0.889 + // High punishment factor, latency should be significantly inflated + ASSERT_GT(limiter._min_latency_us, 500); +} + From 5464889a142bfaf6cbd0a15c3181f08a9150f94b Mon Sep 17 00:00:00 2001 From: yanfeng Date: Mon, 9 Feb 2026 18:18:35 +0800 Subject: [PATCH 2/5] test(auto_cl): improve unit tests based on code review - Use synthetic timestamps instead of sleeping for deterministic tests - Fix trigger sample counting to preserve exact error rates - Consolidate 7 tests to 4 core tests with two-sided assertions - Add expected value range validation in assertions Co-Authored-By: Claude Opus 4.5 --- ...brpc_auto_concurrency_limiter_unittest.cpp | 137 +++++++----------- 1 file changed, 49 insertions(+), 88 deletions(-) diff --git a/test/brpc_auto_concurrency_limiter_unittest.cpp b/test/brpc_auto_concurrency_limiter_unittest.cpp index 7bece930c9..b5e2f7a35c 100644 --- a/test/brpc_auto_concurrency_limiter_unittest.cpp +++ b/test/brpc_auto_concurrency_limiter_unittest.cpp @@ -72,13 +72,17 @@ class AutoConcurrencyLimiterTest : public ::testing::Test { }; // Helper function to add samples and trigger window completion +// Uses synthetic timestamps instead of sleeping for faster, deterministic tests. +// The final successful sample is used as the trigger, so actual counts match +// succ_count/fail_count exactly (preserving intended error rates). 
void AddSamplesAndTriggerWindow(brpc::policy::AutoConcurrencyLimiter& limiter, int succ_count, int64_t succ_latency, int fail_count, int64_t fail_latency) { + ASSERT_GT(succ_count, 0) << "Need at least 1 success to trigger window"; int64_t now = butil::gettimeofday_us(); - // Add successful samples - for (int i = 0; i < succ_count; ++i) { + // Add successful samples (reserve one for the trigger) + for (int i = 0; i < succ_count - 1; ++i) { limiter.AddSample(0, succ_latency, now); } // Add failed samples @@ -86,123 +90,80 @@ void AddSamplesAndTriggerWindow(brpc::policy::AutoConcurrencyLimiter& limiter, limiter.AddSample(1, fail_latency, now); } - // Wait for window to expire and trigger update - bthread_usleep(brpc::policy::FLAGS_auto_cl_sample_window_size_ms * 1000 + 1000); + // Advance timestamp past window expiry instead of sleeping + int64_t after_window = now + brpc::policy::FLAGS_auto_cl_sample_window_size_ms * 1000 + 1000; - // Add one more sample to trigger window submission - limiter.AddSample(0, succ_latency, butil::gettimeofday_us()); + // Use the final success sample to trigger window submission + limiter.AddSample(0, succ_latency, after_window); } -// Test: When threshold is 0 (default), behavior is unchanged - punishment is applied +// Test 1: Backward compatibility - threshold=0 preserves original punishment behavior TEST_F(AutoConcurrencyLimiterTest, ThresholdZeroPreservesOriginalBehavior) { brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0; - brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; // Short window for testing + brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; brpc::policy::AutoConcurrencyLimiter limiter; - AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); - // With threshold=0, failed_punish should NOT be attenuated - // avg_latency = (10*1000 + 90*100) / 90 = (10000 + 9000) / 90 = 211us - // This is significantly inflated from the actual success latency of 100us - // _min_latency_us should reflect this inflation - ASSERT_GT(limiter._min_latency_us, 150); // Should be inflated + // 10% error rate, threshold=0 means full punishment applied + // avg_latency = (10*1000 + 90*100) / 90 = 211us + ASSERT_GT(limiter._min_latency_us, 180); + ASSERT_LT(limiter._min_latency_us, 250); } -// Test: When error rate is below threshold, punishment is zero +// Test 2: Dead zone - error rate below threshold produces zero punishment TEST_F(AutoConcurrencyLimiterTest, BelowThresholdZeroPunishment) { brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.2; // 20% threshold brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; brpc::policy::AutoConcurrencyLimiter limiter; - AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); - // With 10% error rate < 20% threshold, punishment should be zero - // avg_latency should be close to actual success latency of 100us - ASSERT_LT(limiter._min_latency_us, 150); // Should NOT be inflated - ASSERT_GT(limiter._min_latency_us, 50); // Should be valid (around 100us) -} - -// Test: When error rate is above threshold, punishment scales linearly -TEST_F(AutoConcurrencyLimiterTest, AboveThresholdLinearScaling) { - brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; // 10% threshold - brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; - - brpc::policy::AutoConcurrencyLimiter limiter; - - AddSamplesAndTriggerWindow(limiter, 50, 100, 50, 1000); - - // With 50% error rate > 10% threshold: - // punish_factor = (0.5 - 0.1) / (1.0 - 0.1) = 0.4 / 0.9 = 0.444 - // failed_punish = 50 * 1000 * 1.0 * 0.444 = 
22222us - // avg_latency = (22222 + 50*100) / 50 = (22222 + 5000) / 50 = 544us - // This should be inflated, but less than threshold=0 case - ASSERT_GT(limiter._min_latency_us, 200); // Should be somewhat inflated + // 10% error rate < 20% threshold, punishment should be zero + // avg_latency = 90*100 / 90 = 100us (no inflation) + ASSERT_GT(limiter._min_latency_us, 80); + ASSERT_LT(limiter._min_latency_us, 130); } -// Test: Edge case - error rate exactly at threshold +// Test 3: Boundary - error rate exactly at threshold produces zero punishment TEST_F(AutoConcurrencyLimiterTest, ExactlyAtThresholdZeroPunishment) { brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; // 10% threshold brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; brpc::policy::AutoConcurrencyLimiter limiter; - AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); - // At exactly threshold, punishment should be zero (boundary case) - // avg_latency should be close to actual success latency of 100us - ASSERT_LT(limiter._min_latency_us, 150); + // 10% error rate == 10% threshold, punishment should be zero + // avg_latency = 90*100 / 90 = 100us + ASSERT_GT(limiter._min_latency_us, 80); + ASSERT_LT(limiter._min_latency_us, 130); } -// Test: No failed requests - threshold has no effect -TEST_F(AutoConcurrencyLimiterTest, NoFailedRequestsThresholdNoEffect) { - brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; - brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; - - brpc::policy::AutoConcurrencyLimiter limiter; - - AddSamplesAndTriggerWindow(limiter, 100, 100, 0, 0); - - // No failed requests, so threshold logic shouldn't trigger - ASSERT_GT(limiter._min_latency_us, 0); // Should have valid latency - ASSERT_LT(limiter._min_latency_us, 150); // Should be close to 100us -} - -// Test: Compare punishment at different thresholds for same error rate -TEST_F(AutoConcurrencyLimiterTest, DifferentThresholdsDifferentPunishment) { - brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; - - // Test with threshold = 0 (original behavior) - brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0; - brpc::policy::AutoConcurrencyLimiter limiter1; - AddSamplesAndTriggerWindow(limiter1, 95, 100, 5, 1000); // 5% error rate - int64_t latency_threshold_0 = limiter1._min_latency_us; - - // Test with threshold = 0.1 (5% < 10%, in dead zone) - brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; - brpc::policy::AutoConcurrencyLimiter limiter2; - AddSamplesAndTriggerWindow(limiter2, 95, 100, 5, 1000); // 5% error rate - int64_t latency_threshold_10 = limiter2._min_latency_us; - - // With threshold=0, latency should be inflated - // With threshold=0.1 and 5% error rate (below threshold), latency should not be inflated - ASSERT_GT(latency_threshold_0, latency_threshold_10); -} - -// Test: Verify linear scaling formula -TEST_F(AutoConcurrencyLimiterTest, LinearScalingFormula) { - // At 90% error rate, punishment factor should be 0.889 - brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; +// Test 4: Linear scaling - above threshold, punishment scales proportionally +TEST_F(AutoConcurrencyLimiterTest, AboveThresholdLinearScaling) { + brpc::policy::FLAGS_auto_cl_error_rate_punish_threshold = 0.1; // 10% threshold brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; - brpc::policy::AutoConcurrencyLimiter limiter; - - AddSamplesAndTriggerWindow(limiter, 10, 100, 90, 1000); + // Case A: 50% error rate + // punish_factor = (0.5 - 0.1) / (1.0 - 0.1) = 0.444 + // failed_punish = 50 * 1000 * 0.444 
= 22222us + // avg_latency = (22222 + 50*100) / 50 = 544us + { + brpc::policy::AutoConcurrencyLimiter limiter; + AddSamplesAndTriggerWindow(limiter, 50, 100, 50, 1000); + ASSERT_GT(limiter._min_latency_us, 450); + ASSERT_LT(limiter._min_latency_us, 650); + } - // With 90% error rate > 10% threshold: - // punish_factor = (0.9 - 0.1) / (1.0 - 0.1) = 0.8 / 0.9 = 0.889 - // High punishment factor, latency should be significantly inflated - ASSERT_GT(limiter._min_latency_us, 500); + // Case B: 90% error rate (near full punishment) + // punish_factor = (0.9 - 0.1) / (1.0 - 0.1) = 0.889 + // failed_punish = 90 * 1000 * 0.889 = 80000us + // avg_latency = (80000 + 10*100) / 10 = 8100us + { + brpc::policy::AutoConcurrencyLimiter limiter; + AddSamplesAndTriggerWindow(limiter, 10, 100, 90, 1000); + ASSERT_GT(limiter._min_latency_us, 7000); + ASSERT_LT(limiter._min_latency_us, 9000); + } } From 76fdd95ef13e1ff10e36c1feab64dcf342587110 Mon Sep 17 00:00:00 2001 From: yanfeng Date: Mon, 9 Feb 2026 18:20:06 +0800 Subject: [PATCH 3/5] docs(auto_cl): add parameter configuration section - Document error punishment related GFlags - Add detailed explanation for auto_cl_error_rate_punish_threshold - Include table of all configurable parameters with defaults Co-Authored-By: Claude Opus 4.5 --- docs/cn/auto_concurrency_limiter.md | 44 +++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/cn/auto_concurrency_limiter.md b/docs/cn/auto_concurrency_limiter.md index 17ef5d7ec3..8768541130 100644 --- a/docs/cn/auto_concurrency_limiter.md +++ b/docs/cn/auto_concurrency_limiter.md @@ -154,3 +154,47 @@ netflix中的gradient算法公式为:max_concurrency = min_latency / latency * * gradient算法中的max_concurrency / latency从概念上和qps有关联(根据little's law),但可能严重脱节。比如在重测 min_latency前,若所有latency都小于min_latency,那么max_concurrency会不断下降甚至到0;但按照本算法,max_qps和min_latency仍然是稳定的,它们计算出的max_concurrency也不会剧烈变动。究其本质,gradient算法在迭代max_concurrency时,latency并不能代表实际并发为max_concurrency时的延时,两者是脱节的,所以max_concurrency / latency的实际物理含义不明,与qps可能差异甚大,最后导致了很大的偏差。 * gradient算法的queue_size推荐为sqrt(max_concurrency),这是不合理的。netflix对queue_size的理解大概是代表各种不可控环节的缓存,比如socket里的,和max_concurrency存在一定的正向关系情有可原。但在我们的理解中,这部分queue_size作用微乎其微,没有或用常量即可。我们关注的queue_size是给concurrency上升留出的探索空间: max_concurrency的更新是有延迟的,在并发从低到高的增长过程中,queue_size的作用就是在max_concurrency更新前不限制qps上升。而当concurrency高时,服务可能已经过载了,queue_size就应该小一点,防止进一步恶化延时。这里的queue_size和并发是反向关系。 + +## 参数配置 + +### 错误请求惩罚 + +自适应限流在计算平均延时时,默认会将失败请求的延时也计入统计,以避免在下游服务异常时过度放大max_concurrency。相关参数如下: + +| GFlag | 默认值 | 说明 | +|-------|--------|------| +| auto_cl_enable_error_punish | true | 是否开启错误请求惩罚。关闭后失败请求不计入延时统计 | +| auto_cl_fail_punish_ratio | 1.0 | 惩罚系数。值越大惩罚越激进,失败请求对平均延时的影响越大 | +| auto_cl_error_rate_punish_threshold | 0 | 错误率惩罚阈值。见下文详细说明 | + +#### 错误率惩罚阈值 + +`auto_cl_error_rate_punish_threshold`用于设置错误率"死区",低于该阈值的错误率不会产生惩罚,避免少量错误请求对max_concurrency的过度影响。 + +- **默认值为0**:保持原有行为,所有失败请求都会产生惩罚 +- **设置为正值(如0.1)**: + - 错误率 ≤ 阈值时:惩罚为0,平均延时仅由成功请求决定 + - 错误率 > 阈值时:惩罚线性增长,从0逐步恢复到完整惩罚 + +线性衰减公式:`punish_factor = (error_rate - threshold) / (1.0 - threshold)` + +**使用场景**:当服务存在少量固有错误(如个别请求参数异常)时,这些错误不应影响对服务处理能力的判断。通过设置合理的阈值(如0.05或0.1),可以过滤掉这部分噪声。 + +**示例**: +``` +# 错误率低于10%时不惩罚,高于10%时线性增加惩罚 +--auto_cl_error_rate_punish_threshold=0.1 +``` + +### 其他参数 + +| GFlag | 默认值 | 说明 | +|-------|--------|------| +| auto_cl_sample_window_size_ms | 1000 | 采样窗口时长(毫秒) | +| auto_cl_min_sample_count | 100 | 采样窗口内的最小样本数,不足则丢弃该窗口 | +| auto_cl_max_sample_count | 200 | 采样窗口内的最大样本数,超过则提前提交窗口 | +| auto_cl_initial_max_concurrency | 40 | 初始最大并发数 | +| 
auto_cl_alpha_factor_for_ema | 0.1 | EMA平滑系数,值越小单次采样窗口对结果影响越小 | +| auto_cl_max_explore_ratio | 0.3 | 最大探索比例,值越大对延时波动的容忍度越高 | +| auto_cl_min_explore_ratio | 0.06 | 最小探索比例,用于判断服务负载情况 | +| auto_cl_noload_latency_remeasure_interval_ms | 50000 | 重测noload_latency的间隔(毫秒) | From 0fecd8a1e274f04d4d84d409230312fca60c7b3f Mon Sep 17 00:00:00 2001 From: yanfeng Date: Mon, 9 Feb 2026 19:50:53 +0800 Subject: [PATCH 4/5] test(auto_cl): fix comments and add Bazel target - Fix avg_latency comments to reflect std::ceil() rounding behavior - Add cc_test target in BUILD.bazel for Bazel CI coverage Co-Authored-By: Claude Opus 4.5 --- test/BUILD.bazel | 13 +++++++++++++ test/brpc_auto_concurrency_limiter_unittest.cpp | 8 ++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 05420ae310..66aef4259e 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -269,6 +269,19 @@ cc_test( ], ) +cc_test( + name = "brpc_auto_concurrency_limiter_test", + srcs = [ + "brpc_auto_concurrency_limiter_unittest.cpp", + ], + copts = COPTS, + deps = [ + "//:brpc", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + refresh_compile_commands( name = "brpc_test_compdb", # Specify the targets of interest. diff --git a/test/brpc_auto_concurrency_limiter_unittest.cpp b/test/brpc_auto_concurrency_limiter_unittest.cpp index b5e2f7a35c..87619b7e8f 100644 --- a/test/brpc_auto_concurrency_limiter_unittest.cpp +++ b/test/brpc_auto_concurrency_limiter_unittest.cpp @@ -106,7 +106,7 @@ TEST_F(AutoConcurrencyLimiterTest, ThresholdZeroPreservesOriginalBehavior) { AddSamplesAndTriggerWindow(limiter, 90, 100, 10, 1000); // 10% error rate, threshold=0 means full punishment applied - // avg_latency = (10*1000 + 90*100) / 90 = 211us + // avg_latency = ceil((10*1000 + 90*100) / 90) = ceil(211.1) = 212us ASSERT_GT(limiter._min_latency_us, 180); ASSERT_LT(limiter._min_latency_us, 250); } @@ -145,9 +145,9 @@ TEST_F(AutoConcurrencyLimiterTest, AboveThresholdLinearScaling) { brpc::policy::FLAGS_auto_cl_sample_window_size_ms = 10; // Case A: 50% error rate - // punish_factor = (0.5 - 0.1) / (1.0 - 0.1) = 0.444 - // failed_punish = 50 * 1000 * 0.444 = 22222us - // avg_latency = (22222 + 50*100) / 50 = 544us + // punish_factor = (0.5 - 0.1) / (1.0 - 0.1) = 4/9 ≈ 0.444 + // failed_punish = 50 * 1000 * (4/9) = 22222.2us + // avg_latency = ceil((22222.2 + 50*100) / 50) = ceil(544.4) = 545us { brpc::policy::AutoConcurrencyLimiter limiter; AddSamplesAndTriggerWindow(limiter, 50, 100, 50, 1000); From a18e2bf857a95252e7806f431d77dc2cd96bb70b Mon Sep 17 00:00:00 2001 From: yanfeng Date: Mon, 9 Feb 2026 19:58:16 +0800 Subject: [PATCH 5/5] feat(auto_cl): handle invalid threshold values gracefully - Skip attenuation logic when threshold <= 0 or >= 1 - Update GFlag description to document valid range (0, 1) - Add documentation for the new parameter Co-Authored-By: Claude Opus 4.5 --- docs/cn/auto_concurrency_limiter.md | 40 ++++--------------- src/brpc/policy/auto_concurrency_limiter.cpp | 25 ++++++------ ...brpc_auto_concurrency_limiter_unittest.cpp | 7 ++-- 3 files changed, 23 insertions(+), 49 deletions(-) diff --git a/docs/cn/auto_concurrency_limiter.md b/docs/cn/auto_concurrency_limiter.md index 8768541130..342e9ba641 100644 --- a/docs/cn/auto_concurrency_limiter.md +++ b/docs/cn/auto_concurrency_limiter.md @@ -155,46 +155,20 @@ netflix中的gradient算法公式为:max_concurrency = min_latency / latency * 
min_latency前,若所有latency都小于min_latency,那么max_concurrency会不断下降甚至到0;但按照本算法,max_qps和min_latency仍然是稳定的,它们计算出的max_concurrency也不会剧烈变动。究其本质,gradient算法在迭代max_concurrency时,latency并不能代表实际并发为max_concurrency时的延时,两者是脱节的,所以max_concurrency / latency的实际物理含义不明,与qps可能差异甚大,最后导致了很大的偏差。 * gradient算法的queue_size推荐为sqrt(max_concurrency),这是不合理的。netflix对queue_size的理解大概是代表各种不可控环节的缓存,比如socket里的,和max_concurrency存在一定的正向关系情有可原。但在我们的理解中,这部分queue_size作用微乎其微,没有或用常量即可。我们关注的queue_size是给concurrency上升留出的探索空间: max_concurrency的更新是有延迟的,在并发从低到高的增长过程中,queue_size的作用就是在max_concurrency更新前不限制qps上升。而当concurrency高时,服务可能已经过载了,queue_size就应该小一点,防止进一步恶化延时。这里的queue_size和并发是反向关系。 -## 参数配置 - -### 错误请求惩罚 - -自适应限流在计算平均延时时,默认会将失败请求的延时也计入统计,以避免在下游服务异常时过度放大max_concurrency。相关参数如下: - -| GFlag | 默认值 | 说明 | -|-------|--------|------| -| auto_cl_enable_error_punish | true | 是否开启错误请求惩罚。关闭后失败请求不计入延时统计 | -| auto_cl_fail_punish_ratio | 1.0 | 惩罚系数。值越大惩罚越激进,失败请求对平均延时的影响越大 | -| auto_cl_error_rate_punish_threshold | 0 | 错误率惩罚阈值。见下文详细说明 | - -#### 错误率惩罚阈值 +## 错误率惩罚阈值 `auto_cl_error_rate_punish_threshold`用于设置错误率"死区",低于该阈值的错误率不会产生惩罚,避免少量错误请求对max_concurrency的过度影响。 -- **默认值为0**:保持原有行为,所有失败请求都会产生惩罚 -- **设置为正值(如0.1)**: - - 错误率 ≤ 阈值时:惩罚为0,平均延时仅由成功请求决定 - - 错误率 > 阈值时:惩罚线性增长,从0逐步恢复到完整惩罚 - -线性衰减公式:`punish_factor = (error_rate - threshold) / (1.0 - threshold)` +| GFlag | 默认值 | 有效范围 | 说明 | +|-------|--------|----------|------| +| auto_cl_error_rate_punish_threshold | 0 | [0, 1) | 错误率惩罚阈值,0表示禁用 | -**使用场景**:当服务存在少量固有错误(如个别请求参数异常)时,这些错误不应影响对服务处理能力的判断。通过设置合理的阈值(如0.05或0.1),可以过滤掉这部分噪声。 +- **默认值为0**:禁用该功能,保持原有行为 +- **设置为有效值(如0.1)**:错误率 ≤ 阈值时惩罚为0;错误率 > 阈值时惩罚线性增长 +- **无效值处理**:≥1 的值会被忽略,等同于0 **示例**: ``` # 错误率低于10%时不惩罚,高于10%时线性增加惩罚 --auto_cl_error_rate_punish_threshold=0.1 ``` - -### 其他参数 - -| GFlag | 默认值 | 说明 | -|-------|--------|------| -| auto_cl_sample_window_size_ms | 1000 | 采样窗口时长(毫秒) | -| auto_cl_min_sample_count | 100 | 采样窗口内的最小样本数,不足则丢弃该窗口 | -| auto_cl_max_sample_count | 200 | 采样窗口内的最大样本数,超过则提前提交窗口 | -| auto_cl_initial_max_concurrency | 40 | 初始最大并发数 | -| auto_cl_alpha_factor_for_ema | 0.1 | EMA平滑系数,值越小单次采样窗口对结果影响越小 | -| auto_cl_max_explore_ratio | 0.3 | 最大探索比例,值越大对延时波动的容忍度越高 | -| auto_cl_min_explore_ratio | 0.06 | 最小探索比例,用于判断服务负载情况 | -| auto_cl_noload_latency_remeasure_interval_ms | 50000 | 重测noload_latency的间隔(毫秒) | diff --git a/src/brpc/policy/auto_concurrency_limiter.cpp b/src/brpc/policy/auto_concurrency_limiter.cpp index 220d8a1693..51ea56d765 100644 --- a/src/brpc/policy/auto_concurrency_limiter.cpp +++ b/src/brpc/policy/auto_concurrency_limiter.cpp @@ -79,12 +79,11 @@ DEFINE_int32(auto_cl_latency_fluctuation_correction_factor, 1, "when the server is overloaded."); DEFINE_double(auto_cl_error_rate_punish_threshold, 0, "Threshold for error-rate-based punishment attenuation. " - "0 (default): no effect, original punishment logic is used. " - "> 0 (e.g. 0.1): error rates below this threshold produce zero " - "punishment; above it the punishment scales linearly from 0 to " - "full strength. Only effective when auto_cl_enable_error_punish " - "is true. Example: 0.1 means error rates below 10%% are not " - "punished."); + "Valid range: [0, 1). 0 (default) disables the feature. " + "Values >= 1 are ignored and treated as 0. " + "e.g. 0.1: error rates below 10%% produce zero punishment; " + "above it the punishment scales linearly from 0 to full strength. 
" + "Only effective when auto_cl_enable_error_punish is true."); AutoConcurrencyLimiter::AutoConcurrencyLimiter() : _max_concurrency(FLAGS_auto_cl_initial_max_concurrency) @@ -245,12 +244,14 @@ void AutoConcurrencyLimiter::UpdateMaxConcurrency(int64_t sampling_time_us) { int32_t total_succ_req = _total_succ_req.load(butil::memory_order_relaxed); double failed_punish = _sw.total_failed_us * FLAGS_auto_cl_fail_punish_ratio; - // Threshold-based attenuation: when auto_cl_error_rate_punish_threshold > 0, - // attenuate punishment based on error rate. Inspired by Sentinel's threshold- - // based circuit breaker: low error rates should not inflate avg_latency. - // Above threshold, punishment scales linearly from 0 to full strength. - // When threshold is 0 (default), this block is skipped entirely. - if (FLAGS_auto_cl_error_rate_punish_threshold > 0 && _sw.failed_count > 0) { + // Threshold-based attenuation: when 0 < threshold < 1, attenuate punishment + // based on error rate. Inspired by Sentinel's threshold-based circuit breaker: + // low error rates should not inflate avg_latency. Above threshold, punishment + // scales linearly from 0 to full strength. + // Invalid values (<=0 or >=1) skip this block entirely, preserving original behavior. + if (FLAGS_auto_cl_error_rate_punish_threshold > 0 && + FLAGS_auto_cl_error_rate_punish_threshold < 1.0 && + _sw.failed_count > 0) { double threshold = FLAGS_auto_cl_error_rate_punish_threshold; double error_rate = static_cast(_sw.failed_count) / (_sw.succ_count + _sw.failed_count); diff --git a/test/brpc_auto_concurrency_limiter_unittest.cpp b/test/brpc_auto_concurrency_limiter_unittest.cpp index 87619b7e8f..77163e2fb8 100644 --- a/test/brpc_auto_concurrency_limiter_unittest.cpp +++ b/test/brpc_auto_concurrency_limiter_unittest.cpp @@ -17,7 +17,6 @@ #include "brpc/policy/auto_concurrency_limiter.h" #include "butil/time.h" -#include "bthread/bthread.h" #include namespace brpc { @@ -156,9 +155,9 @@ TEST_F(AutoConcurrencyLimiterTest, AboveThresholdLinearScaling) { } // Case B: 90% error rate (near full punishment) - // punish_factor = (0.9 - 0.1) / (1.0 - 0.1) = 0.889 - // failed_punish = 90 * 1000 * 0.889 = 80000us - // avg_latency = (80000 + 10*100) / 10 = 8100us + // punish_factor = (0.9 - 0.1) / (1.0 - 0.1) = 8/9 ≈ 0.889 + // failed_punish = 90 * 1000 * (8/9) = 80000us + // avg_latency = ceil((80000 + 10*100) / 10) = ceil(8100) = 8100us { brpc::policy::AutoConcurrencyLimiter limiter; AddSamplesAndTriggerWindow(limiter, 10, 100, 90, 1000);