outlet/kafka: prevent discarding flows on shutdown
When shutting down the outlet, the core component is shut down first, and Kafka and ClickHouse only after that. This means we were still consuming messages from Kafka and throwing them away. Instead, let the core component stop the Kafka workers before shutting down itself.

Fix #2100
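
To make the ordering concrete, here is a minimal, self-contained Go sketch of the fixed shutdown sequence. It is not the actual akvorado code: `core` and `kafkaWorkers` are placeholder types invented for this example; only `StopWorkers()`, `Kill(nil)` and `Wait()` mirror the methods touched by the diff below.

// Sketch only: placeholder types illustrating the new shutdown ordering.
package main

import (
    "fmt"

    "gopkg.in/tomb.v2"
)

// kafkaWorkers stands in for the outlet Kafka component; only the part
// relevant to shutdown ordering is modeled.
type kafkaWorkers struct{ stopped chan struct{} }

// StopWorkers tells the consumers to stop fetching from Kafka.
func (k *kafkaWorkers) StopWorkers() { close(k.stopped) }

// core stands in for the core component, which runs its goroutines in a tomb.
type core struct {
    t     tomb.Tomb
    kafka *kafkaWorkers
}

func (c *core) start() {
    c.t.Go(func() error {
        <-c.kafka.stopped // process flows until the Kafka workers stop
        return nil
    })
}

// Stop now stops the Kafka workers first, then tears down the core
// goroutines. With the old ordering (kill the core first, stop Kafka
// later), workers kept fetching messages that nobody processed anymore.
func (c *core) Stop() error {
    c.kafka.StopWorkers()
    c.t.Kill(nil)
    return c.t.Wait()
}

func main() {
    c := &core{kafka: &kafkaWorkers{stopped: make(chan struct{})}}
    c.start()
    fmt.Println("stopped:", c.Stop())
}

The point is simply that the consumers stop feeding flows before the goroutines that process them go away, so nothing fetched from Kafka is silently dropped.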

@@ -16,6 +16,7 @@ identified with a specific icon:
  ClickHouse, Kafka and remote data sources (previously, `verify` was set to
  false by default)
- 🩹 *outlet*: provide additional gracetime for a worker to send to ClickHouse
- 🩹 *outlet*: prevent discarding flows on shutdown
- 🩹 *outlet*: enhance scaling up and down workers to avoid hysteresis
- 🩹 *outlet*: accept flows where interface names or descriptions are missing
- 🩹 *docker*: update Traefik to 3.6.1 (for compatibility with Docker Engine 29)

@@ -101,6 +101,7 @@ func (c *Component) Stop() error {
        c.r.Info().Msg("core component stopped")
    }()
    c.r.Info().Msg("stopping core component")
    c.d.Kafka.StopWorkers()
    c.t.Kill(nil)
    return c.t.Wait()
}

@@ -53,11 +53,6 @@ func (w *worker) shutdown() {

// processIncomingFlow processes one incoming flow from Kafka.
func (w *worker) processIncomingFlow(ctx context.Context, data []byte) error {
    // Do nothing if we are shutting down
    if !w.c.t.Alive() {
        return kafka.ErrStopProcessing
    }

    // Raw flow decoding: fatal
    w.c.metrics.rawFlowsReceived.Inc()
    w.rawFlow.ResetVT()

@@ -40,7 +40,7 @@ type ShutdownFunc func()
type WorkerBuilderFunc func(int, chan<- ScaleRequest) (ReceiveFunc, ShutdownFunc)

// NewConsumer creates a new consumer.
func (c *realComponent) NewConsumer(worker int, callback ReceiveFunc) *Consumer {
func (c *realComponent) newConsumer(worker int, callback ReceiveFunc) *Consumer {
    return &Consumer{
        r: c.r,
        l: c.r.With().Int("worker", worker).Logger(),

@@ -197,6 +197,125 @@ func TestStartSeveralWorkers(t *testing.T) {
    }
}

func TestWorkerStop(t *testing.T) {
    r := reporter.NewMock(t)
    topicName := fmt.Sprintf("test-topic3-%d", rand.Int())
    expectedTopicName := fmt.Sprintf("%s-v%d", topicName, pb.Version)

    cluster, err := kfake.NewCluster(
        kfake.NumBrokers(1),
        kfake.SeedTopics(1, expectedTopicName),
        kfake.WithLogger(kafka.NewLogger(r)),
    )
    if err != nil {
        t.Fatalf("NewCluster() error: %v", err)
    }
    defer cluster.Close()

    // Start the component
    configuration := DefaultConfiguration()
    configuration.Topic = topicName
    configuration.Brokers = cluster.ListenAddrs()
    configuration.FetchMaxWaitTime = 100 * time.Millisecond
    configuration.ConsumerGroup = fmt.Sprintf("outlet-%d", rand.Int())
    configuration.MinWorkers = 1
    c, err := New(r, configuration, Dependencies{Daemon: daemon.NewMock(t)})
    if err != nil {
        t.Fatalf("New() error:\n%+v", err)
    }
    helpers.StartStop(t, c)

    var last int
    done := make(chan bool)
    c.StartWorkers(func(int, chan<- ScaleRequest) (ReceiveFunc, ShutdownFunc) {
        return func(_ context.Context, got []byte) error {
            last, _ = strconv.Atoi(string(got))
            return nil
        }, func() {
            close(done)
        }
    })
    time.Sleep(50 * time.Millisecond)

    // Start producing
    producerConfiguration := kafka.DefaultConfiguration()
    producerConfiguration.Brokers = cluster.ListenAddrs()
    producerOpts, err := kafka.NewConfig(reporter.NewMock(t), producerConfiguration)
    if err != nil {
        t.Fatalf("NewConfig() error:\n%+v", err)
    }
    producerOpts = append(producerOpts, kgo.ProducerLinger(0))
    producer, err := kgo.NewClient(producerOpts...)
    if err != nil {
        t.Fatalf("NewClient() error:\n%+v", err)
    }
    defer producer.Close()
    produceCtx, cancel := context.WithCancel(t.Context())
    defer cancel()
    go func() {
        for i := 1; ; i++ {
            record := &kgo.Record{
                Topic: expectedTopicName,
                Value: []byte(strconv.Itoa(i)),
            }
            producer.ProduceSync(produceCtx, record)
            time.Sleep(5 * time.Millisecond)
        }
    }()

    // Wait a bit and stop workers
    time.Sleep(500 * time.Millisecond)
    c.StopWorkers()
    select {
    case <-done:
    default:
        t.Fatal("StopWorkers(): worker still running!")
    }
    gotMetrics := r.GetMetrics("akvorado_outlet_kafka_", "received_messages_total")
    expected := map[string]string{
        `received_messages_total{worker="0"}`: strconv.Itoa(last),
    }
    if diff := helpers.Diff(gotMetrics, expected); diff != "" {
        t.Fatalf("Metrics (-got, +want):\n%s", diff)
    }

    // Check that if we consume from the same group, we will resume from last+1
    consumerConfiguration := kafka.DefaultConfiguration()
    consumerConfiguration.Brokers = cluster.ListenAddrs()
    consumerOpts, err := kafka.NewConfig(reporter.NewMock(t), consumerConfiguration)
    if err != nil {
        t.Fatalf("NewConfig() error:\n%+v", err)
    }
    consumerOpts = append(consumerOpts,
        kgo.ConsumerGroup(configuration.ConsumerGroup),
        kgo.ConsumeTopics(expectedTopicName),
        kgo.FetchMinBytes(1),
        kgo.FetchMaxWait(10*time.Millisecond),
        kgo.ConsumeStartOffset(kgo.NewOffset().AtStart()),
    )
    consumer, err := kgo.NewClient(consumerOpts...)
    if err != nil {
        t.Fatalf("NewClient() error:\n%+v", err)
    }
    defer consumer.Close()
    fetches := consumer.PollFetches(t.Context())
    if fetches.IsClientClosed() {
        t.Fatal("PollFetches(): client is closed")
    }
    fetches.EachError(func(_ string, _ int32, err error) {
        t.Fatalf("PollFetches() error:\n%+v", err)
    })
    var first int
    fetches.EachRecord(func(r *kgo.Record) {
        if first == 0 {
            first, _ = strconv.Atoi(string(r.Value))
        }
    })
    if last+1 != first {
        t.Fatalf("PollFetches: %d -> %d", last, first)
    }
}

func TestWorkerScaling(t *testing.T) {
    r := reporter.NewMock(t)
    topicName := fmt.Sprintf("test-topic2-%d", rand.Int())

@@ -25,6 +25,7 @@ import (
// Component is the interface a Kafka consumer should implement.
type Component interface {
    StartWorkers(WorkerBuilderFunc) error
    StopWorkers()
    Stop() error
}

@@ -171,10 +172,19 @@ func (c *realComponent) StartWorkers(workerBuilder WorkerBuilderFunc) error {
    return nil
}

// StopWorkers stops all workers
func (c *realComponent) StopWorkers() {
    c.workerMu.Lock()
    defer c.workerMu.Unlock()
    for _, worker := range c.workers {
        worker.stop()
    }
}

// Stop stops the Kafka component
func (c *realComponent) Stop() error {
    defer func() {
        c.stopAllWorkers()
        c.StopWorkers()
        c.kadmClientMu.Lock()
        defer c.kadmClientMu.Unlock()
        if c.kadmClient != nil {

@@ -50,8 +50,13 @@ func (c *mockComponent) StartWorkers(workerBuilder WorkerBuilderFunc) error {
    return nil
}

// StopWorkers stop all workers.
func (c *mockComponent) StopWorkers() {
    close(c.incoming)
}

// Stop stops the mock component.
func (c *mockComponent) Stop() error {
    close(c.incoming)
    c.StopWorkers()
    return nil
}

@@ -55,9 +55,10 @@ func (c *realComponent) startOneWorker() error {
        return err
    }
    callback, shutdown := c.workerBuilder(i, c.workerRequestChan)
    consumer := c.NewConsumer(i, callback)
    consumer := c.newConsumer(i, callback)

    // Goroutine for worker
    done := make(chan bool)
    ctx, cancel := context.WithCancelCause(context.Background())
    ctx = c.t.Context(ctx)
    c.t.Go(func() error {

@@ -76,6 +77,7 @@ func (c *realComponent) startOneWorker() error {
            client.CloseAllowingRebalance()

            shutdown()
            close(done)
        }()

        for {

@@ -103,6 +105,7 @@ func (c *realComponent) startOneWorker() error {
    c.workers = append(c.workers, worker{
        stop: func() {
            cancel(ErrStopProcessing)
            <-done
        },
    })
    c.metrics.workerIncrease.Inc()
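
The stop handle registered in the hunk above pairs a cancel-with-cause with a done channel, so that stop() only returns once the worker goroutine has finished its shutdown path. A standalone sketch of that pattern follows; ErrStopProcessing is the sentinel from the diff, everything else is a placeholder invented for illustration.

package main

import (
    "context"
    "errors"
    "fmt"
)

// ErrStopProcessing mirrors the sentinel used above; the rest is illustrative.
var ErrStopProcessing = errors.New("stop processing")

func main() {
    done := make(chan bool)
    ctx, cancel := context.WithCancelCause(context.Background())

    // Worker goroutine: poll until the context is cancelled, then run the
    // shutdown path and signal completion.
    go func() {
        <-ctx.Done()
        // shutdown() and client teardown would run here
        close(done)
    }()

    // stop() as registered on the worker: cancel with a cause, then wait
    // for the goroutine to really terminate before returning.
    stop := func() {
        cancel(ErrStopProcessing)
        <-done
    }
    stop()
    fmt.Println("worker stopped, cause:", context.Cause(ctx))
}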

@@ -125,15 +128,6 @@ func (c *realComponent) stopOneWorker() {
    c.metrics.workerDecrease.Inc()
}

// stopAllWorkers stops all workers
func (c *realComponent) stopAllWorkers() {
    c.workerMu.Lock()
    defer c.workerMu.Unlock()
    for _, worker := range c.workers {
        worker.stop()
    }
}

// onPartitionsRevoked is called when partitions are revoked. We need to commit.
func (c *realComponent) onPartitionsRevoked(ctx context.Context, client *kgo.Client, _ map[string][]int32) {
    if err := client.CommitMarkedOffsets(ctx); err != nil {