reporter: factorize how we use channels for healthchecking
Add two healthchecks to the snmp component. Other components are not interesting.
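The factorized pattern works like this: instead of each component answering an ad-hoc boolean channel, the reporter hands the component a callback over a "contact" channel, and the component's main loop invokes that callback with its status. The standalone Go sketch below models this contract with made-up names (Status, checkFunc, channelHealthcheck); the real API is the reporter.ChannelHealthcheck / reporter.ChannelHealthcheckFunc pair introduced in the diff below.

package main

import (
	"context"
	"fmt"
	"time"
)

// Status stands in for reporter.HealthcheckStatus.
type Status int

const (
	OK Status = iota
	Error
)

type result struct {
	status Status
	reason string
}

// checkFunc plays the role of reporter.ChannelHealthcheckFunc: the callback a
// worker invokes to report its status.
type checkFunc func(Status, string)

// channelHealthcheck plays the role of reporter.ChannelHealthcheck: it sends a
// callback over the contact channel and waits for a worker to call it back.
func channelHealthcheck(ctx context.Context, contact chan<- checkFunc) result {
	answer := make(chan result, 1)
	cb := func(s Status, reason string) { answer <- result{s, reason} }
	// Hand the callback to whichever worker is listening.
	select {
	case <-ctx.Done():
		return result{Error, "timeout (no worker)"}
	case contact <- cb:
	}
	// Wait for the worker to answer through the callback.
	select {
	case <-ctx.Done():
		return result{Error, "timeout (worker dead)"}
	case res := <-answer:
		return res
	}
}

func main() {
	contact := make(chan checkFunc)
	// Worker loop: alongside its normal work, it answers healthcheck requests.
	go func() {
		for cb := range contact {
			cb(OK, "worker 0 ok")
		}
	}()

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(channelHealthcheck(ctx, contact))
}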
core/root.go (60 lines changed)
@@ -2,7 +2,6 @@
package core

import (
"context"
"errors"
"fmt"
"net"

@@ -33,8 +32,8 @@ type Component struct {

metrics metrics

healthy chan chan<- bool // for healthcheck
httpFlowClients uint32 // for dumping flows
healthy chan reporter.ChannelHealthcheckFunc
httpFlowClients uint32 // for dumping flows
httpFlowChannel chan *flow.FlowMessage
httpFlowFlushDelay time.Duration

@@ -53,7 +52,7 @@ type Dependencies struct {
}

// New creates a new core component.
func New(reporter *reporter.Reporter, configuration Configuration, dependencies Dependencies) (*Component, error) {
func New(r *reporter.Reporter, configuration Configuration, dependencies Dependencies) (*Component, error) {
cache, err := ristretto.NewCache(&ristretto.Config{
NumCounters: int64(configuration.ClassifierCacheSize) * 10,
MaxCost: int64(configuration.ClassifierCacheSize),

@@ -64,11 +63,11 @@ func New(reporter *reporter.Reporter, configuration Configuration, dependencies
return nil, fmt.Errorf("cannot initialize classifier cache: %w", err)
}
c := Component{
r: reporter,
r: r,
d: &dependencies,
config: configuration,

healthy: make(chan chan<- bool),
healthy: make(chan reporter.ChannelHealthcheckFunc),
httpFlowClients: 0,
httpFlowChannel: make(chan *flow.FlowMessage, 10),
httpFlowFlushDelay: time.Second,

@@ -91,7 +90,7 @@ func (c *Component) Start() error {
})
}

c.r.RegisterHealthcheck("core", c.runHealthcheck)
c.r.RegisterHealthcheck("core", c.channelHealthcheck())
c.d.HTTP.AddHandler("/api/v0/flows", c.FlowsHTTPHandler())
return nil
}

@@ -108,8 +107,10 @@ func (c *Component) runWorker(workerID int) error {
case <-c.t.Dying():
c.r.Debug().Int("worker", workerID).Msg("stopping core worker")
return nil
case answerChan := <-c.healthy:
answerChan <- true
case cb := <-c.healthy:
if cb != nil {
cb(reporter.HealthcheckOK, fmt.Sprintf("worker %d ok", workerID))
}
case flow := <-c.d.Flow.Flows():
startBusy := time.Now()
if flow == nil {

@@ -167,43 +168,6 @@ func (c *Component) Stop() error {
return c.t.Wait()
}

func (c *Component) runHealthcheck(ctx context.Context) reporter.HealthcheckResult {
say := func(reason string) reporter.HealthcheckResult {
if reason == "" {
return reporter.HealthcheckResult{
Status: reporter.HealthcheckOK,
Reason: "ok",
}
}
return reporter.HealthcheckResult{Status: reporter.HealthcheckError, Reason: reason}
}

if !c.t.Alive() {
return say("dead")
}

// Request a worker to answer
answerChan := make(chan bool)
defer close(answerChan)
select {
case <-c.t.Dying():
return say("dying")
case <-ctx.Done():
return say("timeout (no worker)")
case c.healthy <- answerChan:
}

// Wait for answer from worker
select {
case <-c.t.Dying():
return say("dying")
case <-ctx.Done():
return say("timeout (worker dead)")
case ok := <-answerChan:
if !ok {
// Cannot happen
return say("worker unwell")
}
return say("")
}
func (c *Component) channelHealthcheck() reporter.HealthcheckFunc {
return reporter.ChannelHealthcheck(c.t.Context(nil), c.healthy)
}

@@ -197,10 +197,10 @@ func TestCore(t *testing.T) {

// Test the healthcheck function
t.Run("healthcheck", func(t *testing.T) {
got := c.runHealthcheck(context.Background())
if diff := helpers.Diff(got, reporter.HealthcheckResult{
_, got := r.RunHealthchecks(context.Background())
if diff := helpers.Diff(got["core"], reporter.HealthcheckResult{
reporter.HealthcheckOK,
"ok",
"worker 0 ok",
}); diff != "" {
t.Fatalf("runHealthcheck() (-got, +want):\n%s", diff)
}

@@ -95,7 +95,15 @@ names with the module name.

It also exposes a simple way to report healthchecks from various
components. While it could be used to kill the application
proactively, currently, it is only exposed through HTTP.
proactively, currently, it is only exposed through HTTP. Not all
components have healthchecks. For example, for the `flow` component,
it is difficult to read from UDP while watching for a check. For the
`http` component, the healthcheck would be too trivial (not in the
routine handling the heavy work). For `kafka`, the hard work is hidden
by the underlying library and we wouldn't want to be declared
unhealthy because of a transient problem by checking broker states
manually. The `daemon` component tracks the important goroutines, so it
is not vital.

The general idea is to give a good visibility to an operator.
Everything that moves should get a counter, errors should either be
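As a rough sketch of the HTTP exposure mentioned above: a component registers a healthcheck with the reporter, and the reporter serves the aggregated results over HTTP. The import path, the mux wiring, and the "/healthcheck" route below are assumptions for illustration; only RegisterHealthcheck, HealthcheckResult, and HealthcheckHTTPHandler come from this commit's diff.

package snippet

import (
	"context"
	"net/http"

	"akvorado/reporter" // assumed import path for the reporter package
)

// exposeHealthchecks registers a trivial always-OK check and mounts the
// reporter's HTTP handler; it assumes r was constructed elsewhere.
func exposeHealthchecks(r *reporter.Reporter, mux *http.ServeMux) {
	r.RegisterHealthcheck("demo", func(ctx context.Context) reporter.HealthcheckResult {
		return reporter.HealthcheckResult{Status: reporter.HealthcheckOK, Reason: "ok"}
	})
	mux.Handle("/healthcheck", r.HealthcheckHTTPHandler()) // route is illustrative
}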

@@ -145,3 +145,43 @@ func (r *Reporter) HealthcheckHTTPHandler() http.Handler {
json.NewEncoder(w).Encode(results)
})
}

// ChannelHealthcheckFunc is the function sent over a channel to signal liveness
type ChannelHealthcheckFunc func(HealthcheckStatus, string)

// ChannelHealthcheck implements a HealthcheckFunc using a channel to
// verify a component's liveness. The component should call the
// function received over the provided channel to report its status.
func ChannelHealthcheck(ctx context.Context, contact chan<- ChannelHealthcheckFunc) HealthcheckFunc {
return func(healthcheckCtx context.Context) HealthcheckResult {
answerChan := make(chan HealthcheckResult)
defer close(answerChan)

signalFunc := func(status HealthcheckStatus, reason string) {
// The answer chan may be closed because this
// function was called too late.
defer recover()
answerChan <- HealthcheckResult{status, reason}
}

// Send the signal function to contact.
select {
case <-ctx.Done():
return HealthcheckResult{HealthcheckError, "dead"}
case <-healthcheckCtx.Done():
return HealthcheckResult{HealthcheckError, "timeout"}
case contact <- signalFunc:
}

// Wait for answer from worker
select {
case <-ctx.Done():
return HealthcheckResult{HealthcheckError, "dead"}
case <-healthcheckCtx.Done():
return HealthcheckResult{HealthcheckError, "timeout"}
case result := <-answerChan:
return result
}

}
}

@@ -79,6 +79,24 @@ func TestHealthcheckCancelContext(t *testing.T) {
})
}

func TestChannelHealthcheck(t *testing.T) {
contact := make(chan reporter.ChannelHealthcheckFunc)
go func() {
select {
case f := <-contact:
f(reporter.HealthcheckOK, "all well, thank you!")
case <-time.After(50 * time.Millisecond):
}
}()

r := reporter.NewMock(t)
r.RegisterHealthcheck("hc1", reporter.ChannelHealthcheck(context.Background(), contact))
testHealthchecks(t, r, context.Background(),
reporter.HealthcheckOK, map[string]reporter.HealthcheckResult{
"hc1": {reporter.HealthcheckOK, "all well, thank you!"},
})
}

func TestHealthcheckHTTPHandler(t *testing.T) {
r := reporter.NewMock(t)
r.RegisterHealthcheck("hc1", func(ctx context.Context) reporter.HealthcheckResult {

snmp/root.go (15 lines changed)

@@ -6,6 +6,7 @@ package snmp
import (
"context"
"errors"
"fmt"
"strconv"
"time"

@@ -91,10 +92,13 @@ func (c *Component) Start() error {
}

// Goroutine to refresh the cache
healthyTicker := make(chan reporter.ChannelHealthcheckFunc)
c.r.RegisterHealthcheck("snmp/ticker", reporter.ChannelHealthcheck(c.t.Context(nil), healthyTicker))
c.t.Go(func() error {
c.r.Debug().Msg("starting SNMP ticker")
ticker := c.d.Clock.Ticker(c.config.CacheRefreshInterval)
defer ticker.Stop()
defer close(healthyTicker)
for {
select {
case <-c.t.Dying():

@@ -105,6 +109,10 @@ func (c *Component) Start() error {
}
}
return nil
case cb := <-healthyTicker:
if cb != nil {
cb(reporter.HealthcheckOK, "ok")
}
case <-ticker.C:
c.sc.Expire(c.config.CacheDuration)
if c.config.CacheRefresh > 0 {

@@ -130,16 +138,23 @@ func (c *Component) Start() error {
})

// Goroutines to poll samplers
healthyWorkers := make(chan reporter.ChannelHealthcheckFunc)
c.r.RegisterHealthcheck("snmp/worker", reporter.ChannelHealthcheck(c.t.Context(nil), healthyWorkers))
for i := 0; i < c.config.Workers; i++ {
workerIDStr := strconv.Itoa(i)
c.t.Go(func() error {
c.r.Debug().Str("worker", workerIDStr).Msg("starting SNMP poller")
defer close(healthyWorkers)
for {
startIdle := time.Now()
select {
case <-c.t.Dying():
c.r.Debug().Str("worker", workerIDStr).Msg("stopping SNMP poller")
return nil
case cb := <-healthyWorkers:
if cb != nil {
cb(reporter.HealthcheckOK, fmt.Sprintf("worker %s ok", workerIDStr))
}
case request := <-c.pollerChannel:
startBusy := time.Now()
samplerIP := request.SamplerIP