outlet/kafka: do not wait forever to decrease workers

Use a time window so that steady requests do not keep the upper hand for
too long. Also, remove duplicate tests.
Vincent Bernat
2025-11-13 08:10:41 +01:00
parent 001fc71bb4
commit 93a6c2ef13
2 changed files with 96 additions and 159 deletions
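
In short: the old loop needed ten decrease requests before acting, and every steady request decremented that counter, so a steady trickle of traffic could postpone a scale-down forever. The new loop keeps a timestamped history, prunes entries older than decreaseRateLimit, and scales down once decrease requests outnumber steady ones within that window. A minimal, self-contained sketch of the idea (record, shouldScaleDown and the values in main are illustrative only, not the actual akvorado API):

package main

import (
	"fmt"
	"time"
)

// request stands in for the ScaleSteady/ScaleDecrease hints of the real scaler.
type request int

const (
	steady request = iota
	decrease
)

type record struct {
	req request
	at  time.Time
}

// shouldScaleDown drops records older than the window, then votes: scale down
// only if decrease requests outnumber steady ones inside the window.
func shouldScaleDown(history []record, now time.Time, window time.Duration) ([]record, bool) {
	cutoff := now.Add(-window)
	i := 0
	for i < len(history) && history[i].at.Before(cutoff) {
		i++
	}
	history = history[i:]
	var down, flat int
	for _, r := range history {
		if r.req == decrease {
			down++
		} else {
			flat++
		}
	}
	return history, down > flat
}

func main() {
	now := time.Now()
	history := []record{
		{steady, now.Add(-2 * time.Hour)}, // too old: pruned, no longer blocks a scale down
		{steady, now.Add(-2 * time.Minute)},
		{decrease, now.Add(-90 * time.Second)},
		{decrease, now.Add(-30 * time.Second)},
	}
	_, down := shouldScaleDown(history, now, 5*time.Minute)
	fmt.Println(down) // true: 2 decrease vs 1 steady inside the 5-minute window
}

The real loop below differs in one detail: it always keeps the most recently appended request (the "for i < len(requestHistory)-1" guard), so the hint that just arrived is never pruned.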


@@ -91,13 +91,19 @@ func scaleWhileDraining(ctx context.Context, ch <-chan ScaleRequest, scaleFn fun
 	wg.Wait()
 }
 
-// runScaler starts the automatic scaling loop
+// requestRecord tracks a scale request with its timestamp.
+type requestRecord struct {
+	request ScaleRequest
+	time    time.Time
+}
+
+// runScaler starts the automatic scaling loop.
 func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequest {
 	ch := make(chan ScaleRequest, config.maxWorkers)
 	go func() {
 		state := new(scalerState)
 		var last time.Time
-		var decreaseCount int
+		var requestHistory []requestRecord
 		for {
 			select {
 			case <-ctx.Done():
@@ -108,7 +114,8 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
 				if last.Add(config.increaseRateLimit).After(now) {
 					continue
 				}
-				// Between increaseRateLimit and decreaseRateLimit, we accept increase requests.
+				// Between increaseRateLimit and decreaseRateLimit, we accept
+				// increase requests.
 				if request == ScaleIncrease {
 					current := config.getWorkerCount()
 					target := state.nextWorkerCount(ScaleIncrease, current, config.minWorkers, config.maxWorkers)
@@ -118,34 +125,54 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
 						})
 					}
 					last = time.Now()
-					decreaseCount = 0
+					requestHistory = requestHistory[:0]
 					continue
 				}
-				// We also count steady requests.
-				if request == ScaleSteady {
-					decreaseCount--
-				}
-				// But we ignore everything else.
+				// Between increaseRateLimit and decreaseRateLimit, we also
+				// count steady requests to give them a head start.
 				if last.Add(config.decreaseRateLimit).After(now) {
+					if request == ScaleSteady {
+						requestHistory = append(requestHistory, requestRecord{request, now})
+					}
 					continue
 				}
-				// Past decreaseRateLimit, we count decrease requests and
-				// request 10 of them if not cancelled by steady requests (they
-				// have a head start).
-				if request == ScaleDecrease {
-					decreaseCount++
-					if decreaseCount >= 10 {
-						current := config.getWorkerCount()
-						target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
-						if target < current {
-							scaleWhileDraining(ctx, ch, func() {
-								config.decreaseWorkers(current, target)
-							})
-						}
-						last = time.Now()
-						decreaseCount = 0
+				// Past decreaseRateLimit, we track all requests.
+				requestHistory = append(requestHistory, requestRecord{request, now})
+
+				// Remove old requests to prevent unbounded growth. We only
+				// consider requests from the last decreaseRateLimit duration to
+				// avoid accumulating requests over many hours.
+				windowStart := now.Add(-config.decreaseRateLimit)
+				i := 0
+				for i < len(requestHistory)-1 && requestHistory[i].time.Before(windowStart) {
+					i++
+				}
+				requestHistory = requestHistory[i:]
+
+				// Count decrease vs steady requests in the window.
+				var decreaseCount int
+				var steadyCount int
+				for _, r := range requestHistory {
+					switch r.request {
+					case ScaleDecrease:
+						decreaseCount++
+					case ScaleSteady:
+						steadyCount++
 					}
 				}
+
+				// Scale down if we have a majority of decrease requests.
+				if decreaseCount > steadyCount {
+					current := config.getWorkerCount()
+					target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
+					if target < current {
+						scaleWhileDraining(ctx, ch, func() {
+							config.decreaseWorkers(current, target)
+						})
+					}
+					last = time.Now()
+					requestHistory = requestHistory[:0]
+				}
 			}
 		}
 	}()
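
The scaler is driven entirely through the channel returned by runScaler; callers only push hints and the loop above decides when to act on them. A hedged sketch of the wiring, assumed to live in the same package; the rate-limit values and the fake worker counter are placeholders that merely mirror the scalerConfiguration fields exercised by the tests below:

// exampleScalerWiring is hypothetical and only illustrates the configuration
// surface; the real outlet/kafka component plugs its worker pool in here.
func exampleScalerWiring(ctx context.Context) chan<- ScaleRequest {
	workers := 1 // stand-in for the real worker pool size
	cfg := scalerConfiguration{
		minWorkers:        1,
		maxWorkers:        16,
		increaseRateLimit: time.Minute,     // placeholder value
		decreaseRateLimit: 5 * time.Minute, // placeholder value
		// Only the scaler goroutine calls these, so a plain int is enough here.
		getWorkerCount:  func() int { return workers },
		increaseWorkers: func(from, to int) { workers = to },
		decreaseWorkers: func(from, to int) { workers = to },
	}
	ch := runScaler(ctx, cfg)
	ch <- ScaleIncrease // e.g. on producer backpressure
	ch <- ScaleSteady   // e.g. when throughput looks fine
	ch <- ScaleDecrease // e.g. when workers sit idle
	return ch
}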


@@ -13,124 +13,6 @@ import (
 	"akvorado/common/helpers"
 )
 
-func TestScalerWithoutRateLimiter(t *testing.T) {
-	for _, tc := range []struct {
-		name       string
-		minWorkers int
-		maxWorkers int
-		requests   []ScaleRequest
-		expected   []int
-	}{
-		{
-			name:       "scale up",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests:   []ScaleRequest{ScaleIncrease},
-			expected:   []int{9},
-		}, {
-			name:       "scale up twice",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests:   []ScaleRequest{ScaleIncrease, ScaleIncrease},
-			expected:   []int{9, 13},
-		}, {
-			name:       "scale up many times",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease, ScaleIncrease, ScaleIncrease,
-				ScaleIncrease, ScaleIncrease,
-			},
-			expected: []int{9, 13, 15, 16},
-		}, {
-			name:       "scale up twice, then down a lot",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				// We need 10 decrease to decrease
-				ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-			},
-			expected: []int{9, 13, 12},
-		}, {
-			name:       "scale up twice, then down, steady, and repeat",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-			},
-			expected: []int{9, 13},
-		}, {
-			name:       "scale up twice, then down, steady, down, steady, down, down, repeat",
-			minWorkers: 1,
-			maxWorkers: 16,
-			requests: []ScaleRequest{
-				ScaleIncrease, ScaleIncrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-				ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-			},
-			expected: []int{9, 13, 12},
-		},
-		// No more tests, the state logic is tested in TestScalerState
-	} {
-		t.Run(tc.name, func(t *testing.T) {
-			synctest.Test(t, func(t *testing.T) {
-				ctx, cancel := context.WithCancel(t.Context())
-				defer cancel()
-				var mu sync.Mutex
-				currentWorkers := tc.minWorkers
-				got := []int{}
-				config := scalerConfiguration{
-					minWorkers:        tc.minWorkers,
-					maxWorkers:        tc.maxWorkers,
-					increaseRateLimit: time.Second,
-					decreaseRateLimit: time.Second,
-					getWorkerCount: func() int {
-						mu.Lock()
-						defer mu.Unlock()
-						return currentWorkers
-					},
-					increaseWorkers: func(from, to int) {
-						t.Logf("increaseWorkers(from: %d, to: %d)", from, to)
-						mu.Lock()
-						defer mu.Unlock()
-						got = append(got, to)
-						currentWorkers = to
-					},
-					decreaseWorkers: func(from, to int) {
-						t.Logf("decreaseWorkers(from: %d, to: %d)", from, to)
-						mu.Lock()
-						defer mu.Unlock()
-						got = append(got, to)
-						currentWorkers = to
-					},
-				}
-				ch := runScaler(ctx, config)
-				for _, req := range tc.requests {
-					ch <- req
-					time.Sleep(5 * time.Second)
-				}
-				mu.Lock()
-				defer mu.Unlock()
-				if diff := helpers.Diff(got, tc.expected); diff != "" {
-					t.Fatalf("runScaler() (-got, +want):\n%s", diff)
-				}
-			})
-		})
-	}
-}
-
 func TestScalerRateLimiter(t *testing.T) {
 	synctest.Test(t, func(t *testing.T) {
 		ctx, cancel := context.WithCancel(t.Context())
@@ -201,19 +83,17 @@ func TestScalerRateLimiter(t *testing.T) {
 		check([]int{8, 12})
 
 		// Do not decrease even after 4 minutes
-		for range 40 {
+		for range 39 {
 			time.Sleep(6 * time.Second)
 			ch <- ScaleDecrease
 		}
-		// time = 5 minutes
+		// time = 4m54
 		check([]int{8, 12})
 
 		// Decrease (5-second timeout done)
-		for range 10 {
-			time.Sleep(6 * time.Second)
-			ch <- ScaleDecrease
-		}
-		// time = 6 minutes
+		time.Sleep(6 * time.Second)
+		ch <- ScaleDecrease
+		// time = 5 minutes
 		check([]int{8, 12, 11})
 
 		// Do not increase
@@ -251,26 +131,56 @@ func TestScalerRateLimiter(t *testing.T) {
 		}
 		check([]int{8, 12, 11, 12, 13, 12})
 
-		// If we have many decrease requests at once, we decrease
-		time.Sleep(300 * time.Second)
+		// If we have one decrease request after 5 minutes, decrease
+		time.Sleep(5 * time.Minute)
 		for range 10 {
 			ch <- ScaleDecrease
 		}
 		check([]int{8, 12, 11, 12, 13, 12, 11})
 
-		// But if they are mixed with steady requests, we shouldn't decrease
-		time.Sleep(300 * time.Second)
-		for range 10 {
-			ch <- ScaleDecrease
+		// But more likely, we have steady requests, then decrease requests
+		time.Sleep(time.Minute)
+		for range 240 {
+			time.Sleep(time.Second)
 			ch <- ScaleSteady
 		}
-		check([]int{8, 12, 11, 12, 13, 12, 11})
-
-		// But if we have less Steady than decrease, we should scale down
-		for range 10 {
+		for range 60 {
+			time.Sleep(time.Second)
 			ch <- ScaleDecrease
 		}
+		// time=6m, no change (240 vs 60)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		for range 60 {
+			time.Sleep(time.Second)
+			ch <- ScaleDecrease
+		}
+		// time=7m, no change (180 vs 120)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		for range 30 {
+			time.Sleep(time.Second)
+			ch <- ScaleDecrease
+		}
+		// time=7m30s, no change (150 vs 150)
+		check([]int{8, 12, 11, 12, 13, 12, 11})
+		time.Sleep(time.Second)
+		ch <- ScaleDecrease
+		// OK, now more decrease than increase!
 		check([]int{8, 12, 11, 12, 13, 12, 11, 10})
+
+		// We should not account for steady requests for too long!
+		time.Sleep(time.Minute)
+		for range 2400 {
+			time.Sleep(time.Second)
+			ch <- ScaleSteady
+		}
+		// 2400 vs 0
+		check([]int{8, 12, 11, 12, 13, 12, 11, 10})
+		time.Sleep(time.Second)
+		for range 300 {
+			ch <- ScaleDecrease
+		}
+		// 0 vs 300
+		check([]int{8, 12, 11, 12, 13, 12, 11, 10, 9})
 	})
 }
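
A note on the harness: these scenarios sleep for simulated minutes and hours yet finish almost instantly, because the testing/synctest package (Go 1.25) runs the test body against a virtual clock that only advances once every goroutine in the bubble is blocked. A minimal, standalone illustration, not taken from the commit:

package scaler_test // hypothetical package name

import (
	"testing"
	"testing/synctest"
	"time"
)

func TestVirtualClock(t *testing.T) {
	synctest.Test(t, func(t *testing.T) {
		start := time.Now()
		time.Sleep(2400 * time.Second) // returns immediately, advancing the fake clock
		if got := time.Since(start); got != 2400*time.Second {
			t.Fatalf("expected exactly 2400s of virtual time, got %v", got)
		}
	})
}

This is also why the test comments above can pin exact positions on the clock, such as time=7m30s.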