outlet/kafka: prevent discarding flows on shutdown

When shutting down the outlet, the core component is shut down first, and
only then Kafka and ClickHouse. During that window, we were still consuming
messages from Kafka and throwing them away. Instead, let the core
component stop the Kafka workers before shutting itself down.

Fix #2100
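
A minimal sketch of the intended ordering, using simplified, hypothetical types rather than the actual akvorado components: the core asks the Kafka component to stop its workers before tearing itself down, so nothing is fetched once it can no longer be forwarded to ClickHouse.

```go
package main

import "fmt"

// kafkaComponent stands in for the real Kafka consumer component
// (hypothetical, simplified for illustration).
type kafkaComponent struct{}

// StopWorkers stops fetching from Kafka; once it returns, no new
// message is handed to the core workers.
func (k *kafkaComponent) StopWorkers() { fmt.Println("kafka: workers stopped") }

// Stop closes the remaining Kafka resources (clients, admin connections).
func (k *kafkaComponent) Stop() { fmt.Println("kafka: component stopped") }

// coreComponent stands in for the core pipeline component.
type coreComponent struct{ kafka *kafkaComponent }

// Stop stops the Kafka workers first, then the core goroutines:
// in-flight flows can still reach ClickHouse, and nothing new is
// consumed only to be discarded.
func (c *coreComponent) Stop() {
	c.kafka.StopWorkers() // stop the source before stopping the pipeline
	fmt.Println("core: stopped")
}

func main() {
	k := &kafkaComponent{}
	c := &coreComponent{kafka: k}
	// Shutdown order: core (which first stops the Kafka workers), then Kafka.
	c.Stop()
	k.Stop()
}
```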
Vincent Bernat
2025-11-15 16:14:24 +01:00
parent 6b27794dd2
commit 1a7fbf66cb
8 changed files with 143 additions and 18 deletions

View File

@@ -16,6 +16,7 @@ identified with a specific icon:
ClickHouse, Kafka and remote data sources (previously, `verify` was set to
false by default)
- 🩹 *outlet*: provide additional gracetime for a worker to send to ClickHouse
- 🩹 *outlet*: prevent discarding flows on shutdown
- 🩹 *outlet*: enhance scaling up and down workers to avoid hysteresis
- 🩹 *outlet*: accept flows where interface names or descriptions are missing
- 🩹 *docker*: update Traefik to 3.6.1 (for compatibility with Docker Engine 29)

View File

@@ -101,6 +101,7 @@ func (c *Component) Stop() error {
c.r.Info().Msg("core component stopped")
}()
c.r.Info().Msg("stopping core component")
c.d.Kafka.StopWorkers()
c.t.Kill(nil)
return c.t.Wait()
}

View File

@@ -53,11 +53,6 @@ func (w *worker) shutdown() {
// processIncomingFlow processes one incoming flow from Kafka.
func (w *worker) processIncomingFlow(ctx context.Context, data []byte) error {
// Do nothing if we are shutting down
if !w.c.t.Alive() {
return kafka.ErrStopProcessing
}
// Raw flow decoding: fatal
w.c.metrics.rawFlowsReceived.Inc()
w.rawFlow.ResetVT()

View File

@@ -40,7 +40,7 @@ type ShutdownFunc func()
type WorkerBuilderFunc func(int, chan<- ScaleRequest) (ReceiveFunc, ShutdownFunc)
// NewConsumer creates a new consumer.
func (c *realComponent) NewConsumer(worker int, callback ReceiveFunc) *Consumer {
func (c *realComponent) newConsumer(worker int, callback ReceiveFunc) *Consumer {
return &Consumer{
r: c.r,
l: c.r.With().Int("worker", worker).Logger(),

View File

@@ -197,6 +197,125 @@ func TestStartSeveralWorkers(t *testing.T) {
}
}
func TestWorkerStop(t *testing.T) {
r := reporter.NewMock(t)
topicName := fmt.Sprintf("test-topic3-%d", rand.Int())
expectedTopicName := fmt.Sprintf("%s-v%d", topicName, pb.Version)
cluster, err := kfake.NewCluster(
kfake.NumBrokers(1),
kfake.SeedTopics(1, expectedTopicName),
kfake.WithLogger(kafka.NewLogger(r)),
)
if err != nil {
t.Fatalf("NewCluster() error: %v", err)
}
defer cluster.Close()
// Start the component
configuration := DefaultConfiguration()
configuration.Topic = topicName
configuration.Brokers = cluster.ListenAddrs()
configuration.FetchMaxWaitTime = 100 * time.Millisecond
configuration.ConsumerGroup = fmt.Sprintf("outlet-%d", rand.Int())
configuration.MinWorkers = 1
c, err := New(r, configuration, Dependencies{Daemon: daemon.NewMock(t)})
if err != nil {
t.Fatalf("New() error:\n%+v", err)
}
helpers.StartStop(t, c)
var last int
done := make(chan bool)
c.StartWorkers(func(int, chan<- ScaleRequest) (ReceiveFunc, ShutdownFunc) {
return func(_ context.Context, got []byte) error {
last, _ = strconv.Atoi(string(got))
return nil
}, func() {
close(done)
}
})
time.Sleep(50 * time.Millisecond)
// Start producing
producerConfiguration := kafka.DefaultConfiguration()
producerConfiguration.Brokers = cluster.ListenAddrs()
producerOpts, err := kafka.NewConfig(reporter.NewMock(t), producerConfiguration)
if err != nil {
t.Fatalf("NewConfig() error:\n%+v", err)
}
producerOpts = append(producerOpts, kgo.ProducerLinger(0))
producer, err := kgo.NewClient(producerOpts...)
if err != nil {
t.Fatalf("NewClient() error:\n%+v", err)
}
defer producer.Close()
produceCtx, cancel := context.WithCancel(t.Context())
defer cancel()
go func() {
for i := 1; ; i++ {
record := &kgo.Record{
Topic: expectedTopicName,
Value: []byte(strconv.Itoa(i)),
}
producer.ProduceSync(produceCtx, record)
time.Sleep(5 * time.Millisecond)
}
}()
// Wait a bit and stop workers
time.Sleep(500 * time.Millisecond)
c.StopWorkers()
select {
case <-done:
default:
t.Fatal("StopWorkers(): worker still running!")
}
gotMetrics := r.GetMetrics("akvorado_outlet_kafka_", "received_messages_total")
expected := map[string]string{
`received_messages_total{worker="0"}`: strconv.Itoa(last),
}
if diff := helpers.Diff(gotMetrics, expected); diff != "" {
t.Fatalf("Metrics (-got, +want):\n%s", diff)
}
// Check that if we consume from the same group, we will resume from last+1
consumerConfiguration := kafka.DefaultConfiguration()
consumerConfiguration.Brokers = cluster.ListenAddrs()
consumerOpts, err := kafka.NewConfig(reporter.NewMock(t), consumerConfiguration)
if err != nil {
t.Fatalf("NewConfig() error:\n%+v", err)
}
consumerOpts = append(consumerOpts,
kgo.ConsumerGroup(configuration.ConsumerGroup),
kgo.ConsumeTopics(expectedTopicName),
kgo.FetchMinBytes(1),
kgo.FetchMaxWait(10*time.Millisecond),
kgo.ConsumeStartOffset(kgo.NewOffset().AtStart()),
)
consumer, err := kgo.NewClient(consumerOpts...)
if err != nil {
t.Fatalf("NewClient() error:\n%+v", err)
}
defer consumer.Close()
fetches := consumer.PollFetches(t.Context())
if fetches.IsClientClosed() {
t.Fatal("PollFetches(): client is closed")
}
fetches.EachError(func(_ string, _ int32, err error) {
t.Fatalf("PollFetches() error:\n%+v", err)
})
var first int
fetches.EachRecord(func(r *kgo.Record) {
if first == 0 {
first, _ = strconv.Atoi(string(r.Value))
}
})
if last+1 != first {
t.Fatalf("PollFetches: %d -> %d", last, first)
}
}
func TestWorkerScaling(t *testing.T) {
r := reporter.NewMock(t)
topicName := fmt.Sprintf("test-topic2-%d", rand.Int())

View File

@@ -25,6 +25,7 @@ import (
// Component is the interface a Kafka consumer should implement.
type Component interface {
StartWorkers(WorkerBuilderFunc) error
StopWorkers()
Stop() error
}
@@ -171,10 +172,19 @@ func (c *realComponent) StartWorkers(workerBuilder WorkerBuilderFunc) error {
return nil
}
// StopWorkers stops all workers
func (c *realComponent) StopWorkers() {
c.workerMu.Lock()
defer c.workerMu.Unlock()
for _, worker := range c.workers {
worker.stop()
}
}
// Stop stops the Kafka component
func (c *realComponent) Stop() error {
defer func() {
c.stopAllWorkers()
c.StopWorkers()
c.kadmClientMu.Lock()
defer c.kadmClientMu.Unlock()
if c.kadmClient != nil {

View File

@@ -50,8 +50,13 @@ func (c *mockComponent) StartWorkers(workerBuilder WorkerBuilderFunc) error {
return nil
}
// StopWorkers stops all workers.
func (c *mockComponent) StopWorkers() {
close(c.incoming)
}
// Stop stops the mock component.
func (c *mockComponent) Stop() error {
close(c.incoming)
c.StopWorkers()
return nil
}

View File

@@ -55,9 +55,10 @@ func (c *realComponent) startOneWorker() error {
return err
}
callback, shutdown := c.workerBuilder(i, c.workerRequestChan)
consumer := c.NewConsumer(i, callback)
consumer := c.newConsumer(i, callback)
// Goroutine for worker
done := make(chan bool)
ctx, cancel := context.WithCancelCause(context.Background())
ctx = c.t.Context(ctx)
c.t.Go(func() error {
@@ -76,6 +77,7 @@ func (c *realComponent) startOneWorker() error {
client.CloseAllowingRebalance()
shutdown()
close(done)
}()
for {
@@ -103,6 +105,7 @@ func (c *realComponent) startOneWorker() error {
c.workers = append(c.workers, worker{
stop: func() {
cancel(ErrStopProcessing)
<-done
},
})
c.metrics.workerIncrease.Inc()
@@ -125,15 +128,6 @@ func (c *realComponent) stopOneWorker() {
c.metrics.workerDecrease.Inc()
}
// stopAllWorkers stops all workers
func (c *realComponent) stopAllWorkers() {
c.workerMu.Lock()
defer c.workerMu.Unlock()
for _, worker := range c.workers {
worker.stop()
}
}
// onPartitionsRevoked is called when partitions are revoked. We need to commit.
func (c *realComponent) onPartitionsRevoked(ctx context.Context, client *kgo.Client, _ map[string][]int32) {
if err := client.CommitMarkedOffsets(ctx); err != nil {
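
For reference, a self-contained sketch of the cancel-and-wait pattern the per-worker `stop` function now uses (hypothetical standalone code, not the actual worker): cancelling the context makes the poll loop exit, and waiting on a `done` channel guarantees the worker has finished its cleanup (committing offsets, closing its client) before `stop` returns.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errStopProcessing mirrors the sentinel cause used to cancel a worker
// (hypothetical name for this sketch).
var errStopProcessing = errors.New("stop processing")

func main() {
	ctx, cancel := context.WithCancelCause(context.Background())
	done := make(chan bool)

	go func() {
		// close(done) only after cleanup, so stop() can wait for it.
		defer close(done)
		for {
			select {
			case <-ctx.Done():
				fmt.Println("worker: exiting:", context.Cause(ctx))
				return // commit offsets and close the client would happen here
			case <-time.After(10 * time.Millisecond):
				fmt.Println("worker: polling")
			}
		}
	}()

	// stop cancels the worker and blocks until it has fully shut down,
	// so no message is processed after stop returns.
	stop := func() {
		cancel(errStopProcessing)
		<-done
	}

	time.Sleep(30 * time.Millisecond)
	stop()
	fmt.Println("main: worker fully stopped")
}
```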