outlet/kafka: do not wait forever to decrease workers

Use a time window so that steady requests do not keep the upper hand for
too long. Also, remove duplicate tests.
Vincent Bernat
2025-11-13 08:10:41 +01:00
parent 001fc71bb4
commit 93a6c2ef13
2 changed files with 96 additions and 159 deletions
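
In short: the old loop needed ten decrease requests before acting, and every steady request decremented that counter, so a steady trickle of traffic could postpone a scale-down forever. The new loop keeps a timestamped history, prunes entries older than decreaseRateLimit, and scales down once decrease requests outnumber steady ones within that window. A minimal, self-contained sketch of the idea (record, shouldScaleDown and the values in main are illustrative only, not the actual akvorado API):

package main

import (
	"fmt"
	"time"
)

// request stands in for the ScaleSteady/ScaleDecrease hints of the real scaler.
type request int

const (
	steady request = iota
	decrease
)

type record struct {
	req request
	at  time.Time
}

// shouldScaleDown drops records older than the window, then votes: scale down
// only if decrease requests outnumber steady ones inside the window.
func shouldScaleDown(history []record, now time.Time, window time.Duration) ([]record, bool) {
	cutoff := now.Add(-window)
	i := 0
	for i < len(history) && history[i].at.Before(cutoff) {
		i++
	}
	history = history[i:]
	var down, flat int
	for _, r := range history {
		if r.req == decrease {
			down++
		} else {
			flat++
		}
	}
	return history, down > flat
}

func main() {
	now := time.Now()
	history := []record{
		{steady, now.Add(-2 * time.Hour)}, // too old: pruned, no longer blocks a scale down
		{steady, now.Add(-2 * time.Minute)},
		{decrease, now.Add(-90 * time.Second)},
		{decrease, now.Add(-30 * time.Second)},
	}
	_, down := shouldScaleDown(history, now, 5*time.Minute)
	fmt.Println(down) // true: 2 decrease vs 1 steady inside the 5-minute window
}

The real loop below differs in one detail: it always keeps the most recently appended request (the "for i < len(requestHistory)-1" guard), so the hint that just arrived is never pruned.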


@@ -91,13 +91,19 @@ func scaleWhileDraining(ctx context.Context, ch <-chan ScaleRequest, scaleFn fun
 	wg.Wait()
 }
 
-// runScaler starts the automatic scaling loop
+// requestRecord tracks a scale request with its timestamp.
+type requestRecord struct {
+	request ScaleRequest
+	time    time.Time
+}
+
+// runScaler starts the automatic scaling loop.
 func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequest {
 	ch := make(chan ScaleRequest, config.maxWorkers)
 	go func() {
 		state := new(scalerState)
 		var last time.Time
-		var decreaseCount int
+		var requestHistory []requestRecord
 		for {
 			select {
 			case <-ctx.Done():
@@ -108,7 +114,8 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
 				if last.Add(config.increaseRateLimit).After(now) {
 					continue
 				}
-				// Between increaseRateLimit and decreaseRateLimit, we accept increase requests.
+				// Between increaseRateLimit and decreaseRateLimit, we accept
+				// increase requests.
 				if request == ScaleIncrease {
 					current := config.getWorkerCount()
 					target := state.nextWorkerCount(ScaleIncrease, current, config.minWorkers, config.maxWorkers)
@@ -118,34 +125,54 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
 						})
 					}
 					last = time.Now()
-					decreaseCount = 0
+					requestHistory = requestHistory[:0]
 					continue
 				}
-				// We also count steady requests.
-				if request == ScaleSteady {
-					decreaseCount--
-				}
-				// But we ignore everything else.
+				// Between increaseRateLimit and decreaseRateLimit, we also
+				// count steady requests to give them a head start.
 				if last.Add(config.decreaseRateLimit).After(now) {
+					if request == ScaleSteady {
+						requestHistory = append(requestHistory, requestRecord{request, now})
+					}
 					continue
 				}
-				// Past decreaseRateLimit, we count decrease requests and
-				// request 10 of them if not cancelled by steady requests (they
-				// have a head start).
-				if request == ScaleDecrease {
-					decreaseCount++
-					if decreaseCount >= 10 {
-						current := config.getWorkerCount()
-						target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
-						if target < current {
-							scaleWhileDraining(ctx, ch, func() {
-								config.decreaseWorkers(current, target)
-							})
-						}
-						last = time.Now()
-						decreaseCount = 0
+				// Past decreaseRateLimit, we track all requests.
+				requestHistory = append(requestHistory, requestRecord{request, now})
+
+				// Remove old requests to prevent unbounded growth. We only
+				// consider requests from the last decreaseRateLimit duration to
+				// avoid accumulating requests over many hours.
+				windowStart := now.Add(-config.decreaseRateLimit)
+				i := 0
+				for i < len(requestHistory)-1 && requestHistory[i].time.Before(windowStart) {
+					i++
+				}
+				requestHistory = requestHistory[i:]
+
+				// Count decrease vs steady requests in the window.
+				var decreaseCount int
+				var steadyCount int
+				for _, r := range requestHistory {
+					switch r.request {
+					case ScaleDecrease:
+						decreaseCount++
+					case ScaleSteady:
+						steadyCount++
 					}
 				}
+
+				// Scale down if we have a majority of decrease requests.
+				if decreaseCount > steadyCount {
+					current := config.getWorkerCount()
+					target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
+					if target < current {
+						scaleWhileDraining(ctx, ch, func() {
+							config.decreaseWorkers(current, target)
+						})
+					}
+					last = time.Now()
+					requestHistory = requestHistory[:0]
+				}
 			}
 		}
 	}()
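
The scaler is driven entirely through the channel returned by runScaler; callers only push hints and the loop above decides when to act on them. A hedged sketch of the wiring, assumed to live in the same package; the rate-limit values and the fake worker counter are placeholders that merely mirror the scalerConfiguration fields exercised by the tests below:

// exampleScalerWiring is hypothetical and only illustrates the configuration
// surface; the real outlet/kafka component plugs its worker pool in here.
func exampleScalerWiring(ctx context.Context) chan<- ScaleRequest {
	workers := 1 // stand-in for the real worker pool size
	cfg := scalerConfiguration{
		minWorkers:        1,
		maxWorkers:        16,
		increaseRateLimit: time.Minute,     // placeholder value
		decreaseRateLimit: 5 * time.Minute, // placeholder value
		// Only the scaler goroutine calls these, so a plain int is enough here.
		getWorkerCount:  func() int { return workers },
		increaseWorkers: func(from, to int) { workers = to },
		decreaseWorkers: func(from, to int) { workers = to },
	}
	ch := runScaler(ctx, cfg)
	ch <- ScaleIncrease // e.g. on producer backpressure
	ch <- ScaleSteady   // e.g. when throughput looks fine
	ch <- ScaleDecrease // e.g. when workers sit idle
	return ch
}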


@@ -13,124 +13,6 @@ import (
 	"akvorado/common/helpers"
 )
 
-func TestScalerWithoutRateLimiter(t *testing.T) {
-	for _, tc := range []struct {
-		name       string
-		minWorkers int
-		maxWorkers int
-		requests   []ScaleRequest
-		expected   []int
-	}{
-		{
-			name:       "scale up",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests:   []ScaleRequest{ScaleIncrease},
-			expected:   []int{9},
-		}, {
-			name:       "scale up twice",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests:   []ScaleRequest{ScaleIncrease, ScaleIncrease},
-			expected:   []int{9, 13},
-		}, {
-			name:       "scale up many times",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease, ScaleIncrease, ScaleIncrease,
-				ScaleIncrease, ScaleIncrease,
-			},
-			expected: []int{9, 13, 15, 16},
-		}, {
-			name:       "scale up twice, then down a lot",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				// We need 10 decrease to decrease
-				ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-			},
-			expected: []int{9, 13, 12},
-		}, {
-			name:       "scale up twice, then down, steady, and repeat",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-			},
-			expected: []int{9, 13},
-		}, {
-			name:       "scale up twice, then down, steady, down, steady, down, down, repeat",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-			},
-			expected: []int{9, 13, 12},
-		},
-		// No more tests, the state logic is tested in TestScalerState
-	} {
-		t.Run(tc.name, func(t *testing.T) {
-			synctest.Test(t, func(t *testing.T) {
-				ctx, cancel := context.WithCancel(t.Context())
-				defer cancel()
-				var mu sync.Mutex
-				currentWorkers := tc.minWorkers
-				got := []int{}
-				config := scalerConfiguration{
-					minWorkers:        tc.minWorkers,
-					maxWorkers:        tc.maxWorkers,
-					increaseRateLimit: time.Second,
-					decreaseRateLimit: time.Second,
-					getWorkerCount: func() int {
-						mu.Lock()
-						defer mu.Unlock()
-						return currentWorkers
-					},
-					increaseWorkers: func(from, to int) {
-						t.Logf("increaseWorkers(from: %d, to: %d)", from, to)
-						mu.Lock()
-						defer mu.Unlock()
-						got = append(got, to)
-						currentWorkers = to
-					},
-					decreaseWorkers: func(from, to int) {
-						t.Logf("decreaseWorkers(from: %d, to: %d)", from, to)
-						mu.Lock()
-						defer mu.Unlock()
-						got = append(got, to)
-						currentWorkers = to
-					},
-				}
-				ch := runScaler(ctx, config)
-				for _, req := range tc.requests {
-					ch <- req
-					time.Sleep(5 * time.Second)
-				}
-				mu.Lock()
-				defer mu.Unlock()
-				if diff := helpers.Diff(got, tc.expected); diff != "" {
-					t.Fatalf("runScaler() (-got, +want):\n%s", diff)
-				}
-			})
-		})
-	}
-}
-
 func TestScalerRateLimiter(t *testing.T) {
 	synctest.Test(t, func(t *testing.T) {
 		ctx, cancel := context.WithCancel(t.Context())
@@ -201,19 +83,17 @@ func TestScalerRateLimiter(t *testing.T) {
 		check([]int{8, 12})
 
 		// Do not decrease even after 4 minutes
-		for range 40 {
+		for range 39 {
 			time.Sleep(6 * time.Second)
 			ch <- ScaleDecrease
 		}
-		// time = 5 minutes
+		// time = 4m54
 		check([]int{8, 12})
 
 		// Decrease (5-second timeout done)
-		for range 10 {
-			time.Sleep(6 * time.Second)
-			ch <- ScaleDecrease
-		}
-		// time = 6 minutes
+		time.Sleep(6 * time.Second)
+		ch <- ScaleDecrease
+		// time = 5 minutes
 		check([]int{8, 12, 11})
 
 		// Do not increase
@@ -251,26 +131,56 @@ func TestScalerRateLimiter(t *testing.T) {
 		}
 		check([]int{8, 12, 11, 12, 13, 12})
 
-		// If we have many decrease requests at once, we decrease
-		time.Sleep(300 * time.Second)
+		// If we have one decrease request after 5 minutes, decrease
+		time.Sleep(5 * time.Minute)
 		for range 10 {
 			ch <- ScaleDecrease
 		}
 		check([]int{8, 12, 11, 12, 13, 12, 11})
 
-		// But if they are mixed with steady requests, we shouldn't decrease
-		time.Sleep(300 * time.Second)
-		for range 10 {
-			ch <- ScaleDecrease
+		// But more likely, we have steady requests, then decrease requests
+		time.Sleep(time.Minute)
+		for range 240 {
+			time.Sleep(time.Second)
 			ch <- ScaleSteady
 		}
-		check([]int{8, 12, 11, 12, 13, 12, 11})
-
-		// But if we have less Steady than decrease, we should scale down
-		for range 10 {
+		for range 60 {
+			time.Sleep(time.Second)
 			ch <- ScaleDecrease
 		}
+		// time=6m, no change (240 vs 60)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		for range 60 {
+			time.Sleep(time.Second)
+			ch <- ScaleDecrease
+		}
+		// time=7m, no change (180 vs 120)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		for range 30 {
+			time.Sleep(time.Second)
+			ch <- ScaleDecrease
+		}
+		// time=7m30s, no change (150 vs 150)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		time.Sleep(time.Second)
+		ch <- ScaleDecrease
+		// OK, now more decrease than increase!
 		check([]int{8, 12, 11, 12, 13, 12, 11, 10})
+
+		// We should not account for steady requests for too long!
+		time.Sleep(time.Minute)
+		for range 2400 {
+			time.Sleep(time.Second)
+			ch <- ScaleSteady
+		}
+		// 2400 vs 0
+		check([]int{8, 12, 11, 12, 13, 12, 11, 10})
+		time.Sleep(time.Second)
+		for range 300 {
+			ch <- ScaleDecrease
+		}
+		// 0 vs 300
+		check([]int{8, 12, 11, 12, 13, 12, 11, 10, 9})
 	})
 }
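
A note on the harness: these scenarios sleep for simulated minutes and hours yet finish almost instantly, because the testing/synctest package (Go 1.25) runs the test body against a virtual clock that only advances once every goroutine in the bubble is blocked. A minimal, standalone illustration, not taken from the commit:

package scaler_test // hypothetical package name

import (
	"testing"
	"testing/synctest"
	"time"
)

func TestVirtualClock(t *testing.T) {
	synctest.Test(t, func(t *testing.T) {
		start := time.Now()
		time.Sleep(2400 * time.Second) // returns immediately, advancing the fake clock
		if got := time.Since(start); got != 2400*time.Second {
			t.Fatalf("expected exactly 2400s of virtual time, got %v", got)
		}
	})
}

This is also why the test comments above can pin exact positions on the clock, such as time=7m30s.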