Files
akvorado/outlet/routing/provider/bioris/root.go
Vincent Bernat beb9a3f0ba build: add even more linting rules
Notably, shorten function signatures by not repeating types.
2025-11-12 22:43:12 +01:00

387 lines
11 KiB
Go

// SPDX-FileCopyrightText: 2024 Free Mobile
// SPDX-License-Identifier: AGPL-3.0-only
package bioris
import (
"context"
"crypto/tls"
"errors"
"fmt"
"math/rand/v2"
"net/netip"
"sync"
"time"
pb "github.com/bio-routing/bio-rd/cmd/ris/api"
bnet "github.com/bio-routing/bio-rd/net"
rpb "github.com/bio-routing/bio-rd/route/api"
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
"github.com/osrg/gobgp/v4/pkg/packet/bgp"
"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
"gopkg.in/tomb.v2"
"akvorado/common/helpers"
"akvorado/common/reporter"
"akvorado/outlet/routing/provider"
"akvorado/outlet/routing/provider/bmp"
)
// Sentinel errors returned while choosing a router and while converting an
// LPM response into a lookup result.
var (
// errNoRouter is returned by chooseRouter when no usable router is known.
errNoRouter = errors.New("no router")
// errNoInstance is returned by chooseRouter when no RIS instance can be
// picked for the chosen router.
errNoInstance = errors.New("no RIS instance available")
// errResultEmpty is returned when the LPM response itself is nil.
errResultEmpty = errors.New("result empty")
// errNoRouteFound is returned when no route can be selected from the LPM
// response.
errNoRouteFound = errors.New("no route found")
// errNoPathFound is returned when the selected route has no usable first
// BGP path.
errNoPathFound = errors.New("no path found")
// errInvalidNextHop is returned when the next hop of the selected path
// cannot be converted to a netip.Addr.
errInvalidNextHop = errors.New("invalid next hop")
)
// RISInstanceRuntime holds the runtime state attached to a single RIS
// instance: its gRPC connection, the client built on top of that connection
// and the configuration it was dialed with.
type RISInstanceRuntime struct {
// conn is the gRPC connection created by Dial and closed by Stop.
conn *grpc.ClientConn
// client is the RoutingInformationService client bound to conn.
client pb.RoutingInformationServiceClient
// config is the configuration of the RIS instance (address, VRF, ...).
config RISInstance
}
// Provider represents the BioRIS routing provider.
type Provider struct {
// r is the reporter used for logging.
r *reporter.Reporter
// d holds the dependencies received by New (notably the daemon used to
// track the tomb).
d *Dependencies
// t supervises the goroutines started by Start and Dial.
t tomb.Tomb
// config is the provider configuration.
config Configuration
// metrics holds the provider metrics (presumably populated by
// initMetrics, which is not part of this file).
metrics metrics
// clientMetrics provides the gRPC client interceptors installed by Dial.
clientMetrics *grpc_prometheus.ClientMetrics
// instances maps the gRPC address of a RIS instance to its runtime state.
// It is filled by Start.
instances map[string]*RISInstanceRuntime
// routers maps a router address (normalized with helpers.AddrTo6) to the
// RIS instances that reported it. It is rebuilt by Refresh.
routers map[netip.Addr][]*RISInstanceRuntime
// mu protects routers: Refresh takes the write lock to swap the map,
// Lookup holds the read lock.
mu sync.RWMutex
}
// Dependencies define the dependencies of the BioRIS Provider. It is an alias
// of provider.Dependencies, so both names denote the same type.
type Dependencies = provider.Dependencies
// Compile-time checks that Provider and Configuration implement the generic
// routing provider interfaces.
var (
	_ provider.Provider      = (*Provider)(nil)
	_ provider.Configuration = Configuration{}
)
// New builds a BioRIS provider from the configuration. It only allocates the
// provider state and sets up metrics; RIS instances are dialed by Start.
func (configuration Configuration) New(r *reporter.Reporter, dependencies Dependencies) (provider.Provider, error) {
	p := &Provider{
		r:             r,
		d:             &dependencies,
		config:        configuration,
		clientMetrics: grpc_prometheus.NewClientMetrics(),
		instances:     map[string]*RISInstanceRuntime{},
		routers:       map[netip.Addr][]*RISInstanceRuntime{},
	}
	p.initMetrics()
	return p, nil
}
// Start starts the BioRIS provider. It creates a gRPC client for each
// configured RIS instance, performs a first synchronous refresh of the router
// list, then refreshes it every p.config.Refresh until the tomb dies.
//
// It returns an error if a client cannot be created for one of the configured
// instances.
func (p *Provider) Start() error {
	p.r.Info().Msg("starting BioRIS provider")

	// Connect to RIS backend (done in background)
	for _, config := range p.config.RISInstances {
		instance, err := p.Dial(config)
		if err != nil {
			return fmt.Errorf("error while dialing %s: %w", config.GRPCAddr, err)
		}
		p.instances[config.GRPCAddr] = instance
	}

	// Each refresh gets its own context bounded by RefreshTimeout.
	refresh := func(ctx context.Context) {
		ctx, cancel := context.WithTimeout(ctx, p.config.RefreshTimeout)
		defer cancel()
		p.Refresh(ctx)
	}
	refresh(context.Background())

	// Label the tomb with this provider's name; it previously reused the
	// label of the BMP provider.
	p.d.Daemon.Track(&p.t, "outlet/bioris")
	p.t.Go(func() error {
		ticker := time.NewTicker(p.config.Refresh)
		defer ticker.Stop()
		for {
			select {
			case <-p.t.Dying():
				return nil
			case <-ticker.C:
				// Tie the refresh to the tomb so that it is aborted on stop.
				refresh(p.t.Context(context.Background()))
			}
		}
	})
	return nil
}
// Dial creates the gRPC client for a RIS instance and starts a goroutine,
// supervised by the provider's tomb, which mirrors the connection state into
// the risUp metric (1 when the connection is ready, 0 otherwise).
func (p *Provider) Dial(config RISInstance) (*RISInstanceRuntime, error) {
	// Plaintext transport unless the instance requests TLS.
	transport := insecure.NewCredentials()
	if config.GRPCSecure {
		transport = credentials.NewTLS(&tls.Config{
			MinVersion: tls.VersionTLS12,
		})
	}

	conn, err := grpc.NewClient(config.GRPCAddr,
		grpc.WithTransportCredentials(transport),
		grpc.WithUnaryInterceptor(p.clientMetrics.UnaryClientInterceptor()),
		grpc.WithStreamInterceptor(p.clientMetrics.StreamClientInterceptor()),
		grpc.WithConnectParams(grpc.ConnectParams{Backoff: backoff.DefaultConfig}),
	)
	if err != nil {
		return nil, fmt.Errorf("error while dialing RIS %s: %w", config.GRPCAddr, err)
	}

	client := pb.NewRoutingInformationServiceClient(conn)
	if client == nil {
		conn.Close()
		return nil, fmt.Errorf("error while opening RIS client %s", config.GRPCAddr)
	}

	p.t.Go(func() error {
		// Start from an impossible state so that the first wait returns
		// immediately and the metric gets its initial value.
		for last := connectivity.State(-1); conn.WaitForStateChange(p.t.Context(context.Background()), last); {
			last = conn.GetState()
			up := 0.0
			if last == connectivity.Ready {
				up = 1
			}
			p.metrics.risUp.WithLabelValues(config.GRPCAddr).Set(up)
		}
		// The wait was interrupted because the tomb context is done.
		return nil
	})

	return &RISInstanceRuntime{
		conn:   conn,
		client: client,
		config: config,
	}, nil
}
// Refresh retrieves the list of routers known to each configured RIS instance
// and replaces the router table with the result.
//
// An instance that has not been dialed or that fails to answer is logged and
// skipped: it contributes no router to the new table and its knownRouters
// gauge is left untouched. Router addresses that cannot be parsed are logged
// and ignored.
func (p *Provider) Refresh(ctx context.Context) {
	routers := make(map[netip.Addr][]*RISInstanceRuntime)
	for _, config := range p.config.RISInstances {
		instance := p.instances[config.GRPCAddr]
		if instance == nil {
			// Not dialed (e.g. Refresh called before Start): skip it instead
			// of dereferencing a nil instance.
			p.r.Err(errNoInstance).Msgf("error while getting routers from %s", config.GRPCAddr)
			continue
		}
		r, err := instance.client.GetRouters(ctx, &pb.GetRoutersRequest{})
		if err != nil {
			p.r.Err(err).Msgf("error while getting routers from %s", config.GRPCAddr)
			continue
		}

		// Count locally and publish once, so that a concurrent scrape never
		// observes a partially rebuilt gauge.
		known := 0
		for _, router := range r.GetRouters() {
			routerAddress, err := netip.ParseAddr(router.Address)
			if err != nil {
				p.r.Err(err).Msgf("error while parsing router address %s", router.Address)
				continue
			}
			routerAddress = helpers.AddrTo6(routerAddress)
			routers[routerAddress] = append(routers[routerAddress], instance)
			known++

			// Instantiate the per-router series so that they exist before
			// the first lookup.
			p.metrics.lpmRequestTimeouts.WithLabelValues(config.GRPCAddr, router.Address)
			p.metrics.lpmRequestErrors.WithLabelValues(config.GRPCAddr, router.Address)
			p.metrics.lpmRequestSuccess.WithLabelValues(config.GRPCAddr, router.Address)
			p.metrics.lpmRequests.WithLabelValues(config.GRPCAddr, router.Address)
			p.metrics.routerChosenAgentIDMatch.WithLabelValues(config.GRPCAddr, router.Address)
			p.metrics.routerChosenFallback.WithLabelValues(config.GRPCAddr, router.Address)
		}
		p.metrics.knownRouters.WithLabelValues(config.GRPCAddr).Set(float64(known))
	}

	// Swap the table under the write lock; lookups hold the read lock.
	p.mu.Lock()
	p.routers = routers
	p.mu.Unlock()
}
// Lookup performs a longest-prefix-match lookup for ip on one of the RIS
// instances and converts the answer to the same result type as the BMP
// provider. The second address argument (the next hop) is ignored; it is
// only accepted for compatibility with the BMP provider. The router table is
// read-locked for the whole duration of the call.
func (p *Provider) Lookup(ctx context.Context, ip, _, agent netip.Addr) (provider.LookupResult, error) {
	p.mu.RLock()
	defer p.mu.RUnlock()

	var none bmp.LookupResult
	response, err := p.lookupLPM(ctx, ip, agent)
	if err != nil {
		return none, err
	}
	result, err := p.lpmResponseToLookupResult(response)
	if err != nil {
		return none, err
	}
	return result, nil
}
// chooseRouter selects the router ID best suited for the given agent IP and a
// RIS instance able to answer for it.
//
// A router whose ID equals the agent address is preferred. Otherwise, an
// arbitrary known router is used as a fallback. The RIS instance is then
// picked at random among the instances which reported that router.
//
// It returns errNoRouter when no usable router is known and errNoInstance
// when no instance can be picked for the chosen router. The caller must hold
// at least the read lock on p.mu, as p.routers is read here.
func (p *Provider) chooseRouter(agent netip.Addr) (netip.Addr, *RISInstanceRuntime, error) {
	chosenRouterID := netip.IPv4Unspecified()

	// The table is keyed by router ID: an exact match is a direct lookup, no
	// need to scan all routers for each request.
	instances, exactMatch := p.routers[agent]
	if exactMatch {
		chosenRouterID = agent
	} else {
		// No exact match: fall back to an arbitrary router. Map iteration
		// order is unspecified, so taking the first entry is enough.
		for routerID, candidates := range p.routers {
			chosenRouterID, instances = routerID, candidates
			break
		}
	}

	// Verify that an actual router was found
	if chosenRouterID.IsUnspecified() {
		return chosenRouterID, nil, errNoRouter
	}

	// Randomly select a RIS instance providing the chosen router. rand.IntN
	// panics when its argument is not positive, hence the length check.
	// In the future, we might also want to exclude currently unavailable
	// RIS instances.
	if len(instances) == 0 {
		return chosenRouterID, nil, errNoInstance
	}
	chosenRis := instances[rand.IntN(len(instances))]
	if chosenRis == nil {
		return chosenRouterID, nil, errNoInstance
	}

	// Update metrics with the chosen router/RIS combination
	routerLabel := chosenRouterID.Unmap().String()
	if exactMatch {
		p.metrics.routerChosenAgentIDMatch.WithLabelValues(chosenRis.config.GRPCAddr, routerLabel).Inc()
	} else {
		p.metrics.routerChosenFallback.WithLabelValues(chosenRis.config.GRPCAddr, routerLabel).Inc()
	}
	return chosenRouterID, chosenRis, nil
}
// lpmResponseToLookupResult converts an LPM response into a lookup result.
//
// Among the returned routes, the one with the longest prefix is kept (the
// first one wins on ties) and only its first path is considered. Communities,
// large communities, the AS path, the prefix length and the next hop are
// copied from that path. ASN is set to the last AS number of the AS path.
//
// It returns errResultEmpty for a nil response, errNoRouteFound when no route
// can be selected, errNoPathFound when the selected route has no usable first
// BGP path and errInvalidNextHop when the next hop cannot be converted.
func (p *Provider) lpmResponseToLookupResult(lpm *pb.LPMResponse) (bmp.LookupResult, error) {
	var res bmp.LookupResult
	if lpm == nil {
		return res, errResultEmpty
	}

	// First: find longest matching prefix under all applicable routes
	var r *rpb.Route
	largestPfxLen := -1
	for _, tr := range lpm.Routes {
		if tr == nil {
			continue
		}
		// Use the getter: it accepts a nil prefix, unlike a direct access to
		// the Length field.
		if pfxLen := int(tr.Pfx.GetLength()); pfxLen > largestPfxLen {
			// We have found a better prefix, set that as the currently used one
			r, largestPfxLen = tr, pfxLen
		}
	}
	if r == nil {
		return res, errNoRouteFound
	}

	// Assume the first path is the preferred path, we are interested only in that path
	if len(r.Paths) < 1 {
		return res, errNoPathFound
	}
	pfx := r.Paths[0]
	if pfx == nil || pfx.BgpPath == nil {
		return res, errNoPathFound
	}

	res.Communities = append(res.Communities, pfx.BgpPath.Communities...)
	for _, c := range pfx.BgpPath.LargeCommunities {
		res.LargeCommunities = append(res.LargeCommunities,
			*bgp.NewLargeCommunity(c.GetGlobalAdministrator(), c.GetDataPart1(), c.GetDataPart2()))
	}
	for _, asP := range pfx.BgpPath.AsPath {
		for _, as := range asP.Asns {
			res.ASPath = append(res.ASPath, as)
			// The last AS number seen ends up as the ASN of the result.
			res.ASN = as
		}
	}
	res.NetMask = uint8(r.Pfx.GetLength())

	// The next hop is optional; when present, it is stored in IPv6 form.
	nh := pfx.BgpPath.GetNextHop()
	if nh != nil {
		bnh := bnet.IPFromProtoIP(nh)
		nhAddr, ok := netip.AddrFromSlice(bnh.ToNetIP())
		if !ok {
			return res, errInvalidNextHop
		}
		res.NextHop = helpers.AddrTo6(nhAddr)
	}
	return res, nil
}
// lookupLPM sends an LPM gRPC request for ip to a BioRIS instance.
//
// The router and the RIS instance are selected with chooseRouter from the
// agent address. The request is bounded by p.config.Timeout. The lpmRequests
// counter is incremented for each request sent, then exactly one of
// lpmRequestTimeouts, lpmRequestErrors or lpmRequestSuccess is incremented
// depending on the outcome.
func (p *Provider) lookupLPM(ctx context.Context, ip, agent netip.Addr) (*pb.LPMResponse, error) {
	// Choose router id and ris
	chosenRouterID, chosenRis, err := p.chooseRouter(agent)
	if err != nil {
		return nil, err
	}

	// Query with a host prefix: /32 for IPv4, /128 for IPv6.
	ipAddr, err := bnet.IPFromBytes(ip.Unmap().AsSlice())
	if err != nil {
		return nil, err
	}
	pfxLen := uint8(32)
	if !ipAddr.IsIPv4() {
		pfxLen = 128
	}
	pfx := bnet.NewPfx(ipAddr, pfxLen)

	risAddr := chosenRis.config.GRPCAddr
	router := chosenRouterID.Unmap().String()
	p.metrics.lpmRequests.WithLabelValues(risAddr, router).Inc()

	ctx, cancel := context.WithTimeout(ctx, p.config.Timeout)
	defer cancel()
	res, err := chosenRis.client.LPM(ctx, &pb.LPMRequest{
		Router: router,
		VrfId:  chosenRis.config.VRFId,
		Vrf:    chosenRis.config.VRF,
		Pfx:    pfx.ToProto(),
	})
	if err != nil {
		// A context expiring on its deadline reports DeadlineExceeded, not
		// Canceled: this is what identifies a timeout. Anything else,
		// including a cancellation by the caller, is a regular error.
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			p.metrics.lpmRequestTimeouts.WithLabelValues(risAddr, router).Inc()
			return nil, errors.New("lpm lookup timeout")
		}
		p.metrics.lpmRequestErrors.WithLabelValues(risAddr, router).Inc()
		return nil, fmt.Errorf("lpm lookup failed: %w", err)
	}
	p.metrics.lpmRequestSuccess.WithLabelValues(risAddr, router).Inc()
	return res, nil
}
// Stop shuts the provider down: it kills the tomb, waits for its goroutines
// to exit, then closes the connection of each RIS instance. The error
// reported by the tomb is returned.
func (p *Provider) Stop() error {
	p.r.Info().Msg("stopping BioRIS provider")
	p.t.Kill(nil)
	err := p.t.Wait()

	// Connections are closed only once the goroutines using them are done.
	for _, instance := range p.instances {
		if instance.conn == nil {
			continue
		}
		instance.conn.Close()
	}
	p.r.Info().Msg("BioRIS provider stopped")
	return err
}