Mirror of https://github.com/akvorado/akvorado.git, synced 2025-12-11 22:14:02 +01:00
outlet/kafka: do not wait forever to decrease workers
Some checks failed
CI / 🤖 Check dependabot status (push) Has been cancelled
CI / 🐧 Test on Linux (${{ github.ref_type == 'tag' }}, misc) (push) Has been cancelled
CI / 🐧 Test on Linux (coverage) (push) Has been cancelled
CI / 🐧 Test on Linux (regular) (push) Has been cancelled
CI / ❄️ Build on Nix (push) Has been cancelled
CI / 🍏 Build and test on macOS (push) Has been cancelled
CI / 🧪 End-to-end testing (push) Has been cancelled
CI / 🔍 Upload code coverage (push) Has been cancelled
CI / 🔬 Test only Go (push) Has been cancelled
CI / 🔬 Test only JS (${{ needs.dependabot.outputs.package-ecosystem }}, 20) (push) Has been cancelled
CI / 🔬 Test only JS (${{ needs.dependabot.outputs.package-ecosystem }}, 22) (push) Has been cancelled
CI / 🔬 Test only JS (${{ needs.dependabot.outputs.package-ecosystem }}, 24) (push) Has been cancelled
CI / ⚖️ Check licenses (push) Has been cancelled
CI / 🐋 Build Docker images (push) Has been cancelled
CI / 🐋 Tag Docker images (push) Has been cancelled
CI / 🚀 Publish release (push) Has been cancelled
Use a time window so that steady requests do not keep the upper hand for too long. Also, remove duplicate tests.
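The change replaces the old counter (ten decrease requests, each cancelled by a steady request) with a bounded history of recent requests: only requests seen during the last decreaseRateLimit are kept, and workers are decreased when decrease requests outnumber steady ones within that window. Below is a rough, self-contained sketch of that rule; the names record and shouldDecrease are hypothetical stand-ins, and the real types and constants live in the outlet/kafka scaler.

package main

import (
    "fmt"
    "time"
)

// ScaleRequest and its values are local stand-ins for the scaler's request type.
type ScaleRequest int

const (
    ScaleSteady ScaleRequest = iota
    ScaleIncrease
    ScaleDecrease
)

// record pairs a request with its arrival time (stand-in for the
// requestRecord type added by this commit).
type record struct {
    request ScaleRequest
    time    time.Time
}

// shouldDecrease drops records older than the window, keeping at least one,
// then reports whether decrease requests outnumber steady ones.
func shouldDecrease(history []record, now time.Time, window time.Duration) ([]record, bool) {
    start := now.Add(-window)
    i := 0
    for i < len(history)-1 && history[i].time.Before(start) {
        i++
    }
    history = history[i:]
    var decrease, steady int
    for _, r := range history {
        switch r.request {
        case ScaleDecrease:
            decrease++
        case ScaleSteady:
            steady++
        }
    }
    return history, decrease > steady
}

func main() {
    now := time.Now()
    history := []record{
        {ScaleSteady, now.Add(-10 * time.Minute)}, // outside the window, pruned
        {ScaleSteady, now.Add(-2 * time.Minute)},
        {ScaleDecrease, now.Add(-time.Minute)},
        {ScaleDecrease, now.Add(-30 * time.Second)},
    }
    history, down := shouldDecrease(history, now, 5*time.Minute)
    fmt.Printf("kept %d requests, scale down: %v\n", len(history), down) // kept 3 requests, scale down: true
}

With a five-minute window, two decrease requests against one steady request is enough to scale down, while the steady request from ten minutes ago no longer counts.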
@@ -91,13 +91,19 @@ func scaleWhileDraining(ctx context.Context, ch <-chan ScaleRequest, scaleFn fun
     wg.Wait()
 }
 
-// runScaler starts the automatic scaling loop
+// requestRecord tracks a scale request with its timestamp.
+type requestRecord struct {
+    request ScaleRequest
+    time    time.Time
+}
+
+// runScaler starts the automatic scaling loop.
 func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequest {
     ch := make(chan ScaleRequest, config.maxWorkers)
     go func() {
         state := new(scalerState)
         var last time.Time
-        var decreaseCount int
+        var requestHistory []requestRecord
         for {
             select {
             case <-ctx.Done():
@@ -108,7 +114,8 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
                 if last.Add(config.increaseRateLimit).After(now) {
                     continue
                 }
-                // Between increaseRateLimit and decreaseRateLimit, we accept increase requests.
+                // Between increaseRateLimit and decreaseRateLimit, we accept
+                // increase requests.
                 if request == ScaleIncrease {
                     current := config.getWorkerCount()
                     target := state.nextWorkerCount(ScaleIncrease, current, config.minWorkers, config.maxWorkers)
@@ -118,23 +125,44 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
                         })
                     }
                     last = time.Now()
-                    decreaseCount = 0
+                    requestHistory = requestHistory[:0]
                     continue
                 }
-                // We also count steady requests.
-                if request == ScaleSteady {
-                    decreaseCount--
-                }
-                // But we ignore everything else.
+                // Between increaseRateLimit and decreaseRateLimit, we also
+                // count steady requests to give them a head start.
                 if last.Add(config.decreaseRateLimit).After(now) {
+                    if request == ScaleSteady {
+                        requestHistory = append(requestHistory, requestRecord{request, now})
+                    }
                     continue
                 }
-                // Past decreaseRateLimit, we count decrease requests and
-                // request 10 of them if not cancelled by steady requests (they
-                // have a head start).
-                if request == ScaleDecrease {
-                    decreaseCount++
-                    if decreaseCount >= 10 {
-                        current := config.getWorkerCount()
-                        target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
-                        if target < current {
+                // Past decreaseRateLimit, we track all requests.
+                requestHistory = append(requestHistory, requestRecord{request, now})
+
+                // Remove old requests to prevent unbounded growth. We only
+                // consider requests from the last decreaseRateLimit duration to
+                // avoid accumulating requests over many hours.
+                windowStart := now.Add(-config.decreaseRateLimit)
+                i := 0
+                for i < len(requestHistory)-1 && requestHistory[i].time.Before(windowStart) {
+                    i++
+                }
+                requestHistory = requestHistory[i:]
+
+                // Count decrease vs steady requests in the window.
+                var decreaseCount int
+                var steadyCount int
+                for _, r := range requestHistory {
+                    switch r.request {
+                    case ScaleDecrease:
+                        decreaseCount++
+                    case ScaleSteady:
+                        steadyCount++
+                    }
+                }
+
+                // Scale down if we have a majority of decrease requests.
+                if decreaseCount > steadyCount {
+                    current := config.getWorkerCount()
+                    target := state.nextWorkerCount(ScaleDecrease, current, config.minWorkers, config.maxWorkers)
+                    if target < current {
@@ -143,8 +171,7 @@ func runScaler(ctx context.Context, config scalerConfiguration) chan<- ScaleRequ
-                            })
-                        }
-                        last = time.Now()
-                        decreaseCount = 0
-                    }
+                        })
+                    }
+                    last = time.Now()
+                    requestHistory = requestHistory[:0]
                 }
             }
         }
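The surrounding machinery, unchanged by this commit and visible in the context lines above, is a buffered request channel drained by a single goroutine in a for/select loop that exits on context cancellation. The following stand-alone sketch shows that shape only; startLoop and the string request type are hypothetical stand-ins, not the real scalerConfiguration wiring.

package main

import (
    "context"
    "fmt"
    "time"
)

// startLoop mirrors the runScaler skeleton (buffered channel, one goroutine,
// for/select, stop on ctx.Done) without any of the scaling logic.
func startLoop(ctx context.Context, buffer int) chan<- string {
    ch := make(chan string, buffer)
    go func() {
        for {
            select {
            case <-ctx.Done():
                return
            case req := <-ch:
                fmt.Println("received scale request:", req)
            }
        }
    }()
    return ch
}

func main() {
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    requests := startLoop(ctx, 16)
    requests <- "increase"
    requests <- "steady"
    requests <- "decrease"
    time.Sleep(100 * time.Millisecond) // give the goroutine time to drain
}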
@@ -13,124 +13,6 @@ import (
     "akvorado/common/helpers"
 )
 
-func TestScalerWithoutRateLimiter(t *testing.T) {
-    for _, tc := range []struct {
-        name       string
-        minWorkers int
-        maxWorkers int
-        requests   []ScaleRequest
-        expected   []int
-    }{
-        {
-            name:       "scale up",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests:   []ScaleRequest{ScaleIncrease},
-            expected:   []int{9},
-        }, {
-            name:       "scale up twice",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests:   []ScaleRequest{ScaleIncrease, ScaleIncrease},
-            expected:   []int{9, 13},
-        }, {
-            name:       "scale up many times",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests: []ScaleRequest{
-                ScaleIncrease, ScaleIncrease, ScaleIncrease, ScaleIncrease,
-                ScaleIncrease, ScaleIncrease,
-            },
-            expected: []int{9, 13, 15, 16},
-        }, {
-            name:       "scale up twice, then down a lot",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests: []ScaleRequest{
-                ScaleIncrease, ScaleIncrease,
-                // We need 10 decrease to decrease
-                ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-                ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease, ScaleDecrease,
-            },
-            expected: []int{9, 13, 12},
-        }, {
-            name:       "scale up twice, then down, steady, and repeat",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests: []ScaleRequest{
-                ScaleIncrease, ScaleIncrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady,
-            },
-            expected: []int{9, 13},
-        }, {
-            name:       "scale up twice, then down, steady, down, steady, down, down, repeat",
-            minWorkers: 1,
-            maxWorkers: 16,
-            requests: []ScaleRequest{
-                ScaleIncrease, ScaleIncrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-                ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleSteady, ScaleDecrease, ScaleDecrease,
-            },
-            expected: []int{9, 13, 12},
-        },
-        // No more tests, the state logic is tested in TestScalerState
-    } {
-        t.Run(tc.name, func(t *testing.T) {
-            synctest.Test(t, func(t *testing.T) {
-                ctx, cancel := context.WithCancel(t.Context())
-                defer cancel()
-
-                var mu sync.Mutex
-                currentWorkers := tc.minWorkers
-                got := []int{}
-                config := scalerConfiguration{
-                    minWorkers:        tc.minWorkers,
-                    maxWorkers:        tc.maxWorkers,
-                    increaseRateLimit: time.Second,
-                    decreaseRateLimit: time.Second,
-                    getWorkerCount: func() int {
-                        mu.Lock()
-                        defer mu.Unlock()
-                        return currentWorkers
-                    },
-                    increaseWorkers: func(from, to int) {
-                        t.Logf("increaseWorkers(from: %d, to: %d)", from, to)
-                        mu.Lock()
-                        defer mu.Unlock()
-                        got = append(got, to)
-                        currentWorkers = to
-                    },
-                    decreaseWorkers: func(from, to int) {
-                        t.Logf("decreaseWorkers(from: %d, to: %d)", from, to)
-                        mu.Lock()
-                        defer mu.Unlock()
-                        got = append(got, to)
-                        currentWorkers = to
-                    },
-                }
-                ch := runScaler(ctx, config)
-                for _, req := range tc.requests {
-                    ch <- req
-                    time.Sleep(5 * time.Second)
-                }
-                mu.Lock()
-                defer mu.Unlock()
-                if diff := helpers.Diff(got, tc.expected); diff != "" {
-                    t.Fatalf("runScaler() (-got, +want):\n%s", diff)
-                }
-            })
-        })
-    }
-}
-
 func TestScalerRateLimiter(t *testing.T) {
     synctest.Test(t, func(t *testing.T) {
         ctx, cancel := context.WithCancel(t.Context())
@@ -201,19 +83,17 @@ func TestScalerRateLimiter(t *testing.T) {
         check([]int{8, 12})
 
         // Do not decrease even after 4 minutes
-        for range 40 {
+        for range 39 {
             time.Sleep(6 * time.Second)
             ch <- ScaleDecrease
         }
-        // time = 5 minutes
+        // time = 4m54
         check([]int{8, 12})
 
         // Decrease (5-second timeout done)
-        for range 10 {
-            time.Sleep(6 * time.Second)
-            ch <- ScaleDecrease
-        }
-        // time = 6 minutes
+        time.Sleep(6 * time.Second)
+        ch <- ScaleDecrease
+        // time = 5 minutes
         check([]int{8, 12, 11})
 
         // Do not increase
@@ -251,26 +131,56 @@ func TestScalerRateLimiter(t *testing.T) {
         }
         check([]int{8, 12, 11, 12, 13, 12})
 
-        // If we have many decrease requests at once, we decrease
-        time.Sleep(300 * time.Second)
+        // If we have one decrease request after 5 minutes, decrease
+        time.Sleep(5 * time.Minute)
         for range 10 {
             ch <- ScaleDecrease
         }
         check([]int{8, 12, 11, 12, 13, 12, 11})
 
-        // But if they are mixed with steady requests, we shouldn't decrease
-        time.Sleep(300 * time.Second)
-        for range 10 {
-            ch <- ScaleDecrease
+        // But more likely, we have steady requests, then decrease requests
+        time.Sleep(time.Minute)
+        for range 240 {
+            time.Sleep(time.Second)
             ch <- ScaleSteady
         }
-        check([]int{8, 12, 11, 12, 13, 12, 11})
-        // But if we have less Steady than decrease, we should scale down
-        for range 10 {
+        for range 60 {
+            time.Sleep(time.Second)
             ch <- ScaleDecrease
         }
+        // time=6m, no change (240 vs 60)
+        check([]int{8, 12, 11, 12, 13, 12, 11})
+        for range 60 {
+            time.Sleep(time.Second)
+            ch <- ScaleDecrease
+        }
+        // time=7m, no change (180 vs 120)
+        check([]int{8, 12, 11, 12, 13, 12, 11})
+        for range 30 {
+            time.Sleep(time.Second)
+            ch <- ScaleDecrease
+        }
+        // time=7m30s, no change (150 vs 150)
+        check([]int{8, 12, 11, 12, 13, 12, 11})
+        time.Sleep(time.Second)
+        ch <- ScaleDecrease
+        // OK, now more decrease than increase!
         check([]int{8, 12, 11, 12, 13, 12, 11, 10})
 
+        // We should not account for steady requests for too long!
+        time.Sleep(time.Minute)
+        for range 2400 {
+            time.Sleep(time.Second)
+            ch <- ScaleSteady
+        }
+        // 2400 vs 0
+        check([]int{8, 12, 11, 12, 13, 12, 11, 10})
+        time.Sleep(time.Second)
+        for range 300 {
+            ch <- ScaleDecrease
+        }
+        // 0 vs 300
+        check([]int{8, 12, 11, 12, 13, 12, 11, 10, 9})
     })
 }
 
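The remaining tests rely on Go's testing/synctest package, already used above: inside synctest.Test, goroutines run against a virtual clock, so a call like time.Sleep(5 * time.Minute) completes as soon as every goroutine in the bubble is blocked, without real waiting. A tiny, self-contained illustration of that behaviour follows; it is not taken from this repository, the package name is arbitrary, and it must live in a _test.go file with a Go toolchain that ships testing/synctest.

package scaler_test

import (
    "testing"
    "testing/synctest"
    "time"
)

// TestVirtualClock shows why the scaler tests can sleep for simulated
// minutes: time only advances on the bubble's fake clock, and it advances
// by exactly the requested duration once all goroutines are blocked.
func TestVirtualClock(t *testing.T) {
    synctest.Test(t, func(t *testing.T) {
        start := time.Now()
        time.Sleep(5 * time.Minute)
        if elapsed := time.Since(start); elapsed != 5*time.Minute {
            t.Fatalf("expected exactly 5 minutes of virtual time, got %s", elapsed)
        }
    })
}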