Files
akvorado/common/schema/definition.go
Vincent Bernat ac68c5970e inlet: split inlet into new inlet and outlet
This change splits the inlet component into a simpler inlet and a new
outlet component. The new inlet component receives flows and puts them in
Kafka, unparsed. The outlet component takes them from Kafka and resumes
the processing from there (flow parsing, enrichment) and puts them in
ClickHouse.

The main goal is to ensure the inlet does minimal work so that it does not
fall behind when processing packets (and restarts faster). It also brings some
simplification as the number of knobs to tune everything is reduced: for
inlet, we only need to tune the queue size for UDP, the number of
workers and a few Kafka parameters; for outlet, we need to tune a few
Kafka parameters, the number of workers and a few ClickHouse parameters.

The outlet component features a simple Kafka input component. The core
component becomes just a callback function. There is also a new
ClickHouse component to push data to ClickHouse using the low-level
ch-go library with batch inserts.

This processing has an impact on the internal representation of a
FlowMessage. Previously, it was tailored to dynamically build the
protobuf message to be put in Kafka. Now, it builds the batch request to
be sent to ClickHouse. As a result, the FlowMessage structure hides the
content of the next batch request and should therefore be reused.
This also changes the way we decode flows as they don't output
FlowMessage anymore, they reuse one that is provided to each worker.

The ClickHouse tables are slightly updated: the Null engine is used
instead of the Kafka engine.

Fix #1122
2025-07-27 21:44:28 +02:00

637 lines
20 KiB
Go

// SPDX-FileCopyrightText: 2022 Free Mobile
// SPDX-License-Identifier: AGPL-3.0-only
package schema
import (
"errors"
"fmt"
"slices"
"strings"
"akvorado/common/helpers/bimap"
"github.com/bits-and-blooms/bitset"
)
// InterfaceBoundary identifies whether the interface is facing inside or outside the network.
type InterfaceBoundary uint
// The numeric values below are interpolated into the Enum8 type of the
// InIfBoundary column in flows(), so reordering them changes that ClickHouse
// type.
const (
// InterfaceBoundaryUndefined means we don't know about the interface.
InterfaceBoundaryUndefined InterfaceBoundary = iota
// InterfaceBoundaryExternal means this interface is facing outside our network.
InterfaceBoundaryExternal
// InterfaceBoundaryInternal means this interface is facing inside our network.
InterfaceBoundaryInternal
)
// interfaceBoundaryMap associates each interface boundary with its textual
// name and allows lookups in both directions.
var interfaceBoundaryMap = bimap.New(map[InterfaceBoundary]string{
	InterfaceBoundaryUndefined: "undefined",
	InterfaceBoundaryExternal:  "external",
	InterfaceBoundaryInternal:  "internal",
})

// errUnknownInterfaceBoundary is returned when converting an interface
// boundary to or from text fails because the value is not in
// interfaceBoundaryMap.
var errUnknownInterfaceBoundary = errors.New("unknown interface boundary")
// MarshalText encodes an interface boundary as its textual name. It returns
// errUnknownInterfaceBoundary when the value is not present in
// interfaceBoundaryMap.
func (ib InterfaceBoundary) MarshalText() ([]byte, error) {
	text, known := interfaceBoundaryMap.LoadValue(ib)
	if !known {
		return nil, errUnknownInterfaceBoundary
	}
	return []byte(text), nil
}
// String returns the textual name of an interface boundary. The "found" flag of
// the lookup is deliberately ignored: whatever interfaceBoundaryMap yields for
// an unregistered value is returned as is (presumably the empty string; verify
// against bimap).
func (ib InterfaceBoundary) String() string {
	text, _ := interfaceBoundaryMap.LoadValue(ib)
	return text
}
// UnmarshalText decodes an interface boundary from its textual name. An empty
// input maps to InterfaceBoundaryUndefined. A name not present in
// interfaceBoundaryMap leaves the receiver untouched and returns
// errUnknownInterfaceBoundary.
func (ib *InterfaceBoundary) UnmarshalText(input []byte) error {
	if len(input) == 0 {
		*ib = InterfaceBoundaryUndefined
		return nil
	}
	boundary, known := interfaceBoundaryMap.LoadKey(string(input))
	if !known {
		return errUnknownInterfaceBoundary
	}
	*ib = boundary
	return nil
}
// Names of the ClickHouse dictionaries.
const (
// DictionaryASNs is the name of the asns ClickHouse dictionary.
DictionaryASNs string = "asns"
// DictionaryProtocols is the name of the protocols ClickHouse dictionary.
DictionaryProtocols string = "protocols"
// DictionaryICMP is the name of the icmp ClickHouse dictionary. It is
// referenced by the ICMPv4 and ICMPv6 column aliases in flows().
DictionaryICMP string = "icmp"
// DictionaryNetworks is the name of the networks ClickHouse dictionary.
DictionaryNetworks string = "networks"
// DictionaryTCP is the name of the tcp ClickHouse dictionary.
DictionaryTCP string = "tcp"
// DictionaryUDP is the name of the udp ClickHouse dictionary.
DictionaryUDP string = "udp"
)
// revive:disable
const (
// Keys of the static columns. Numbering starts at 1 (iota + 1), so the zero
// ColumnKey is not assigned to any static column.
ColumnTimeReceived ColumnKey = iota + 1
ColumnSamplingRate
ColumnEType
ColumnProto
ColumnBytes
ColumnPackets
ColumnPacketSize
ColumnPacketSizeBucket
ColumnForwardingStatus
ColumnExporterAddress
ColumnExporterName
ColumnExporterGroup
ColumnExporterRole
ColumnExporterSite
ColumnExporterRegion
ColumnExporterTenant
ColumnSrcAddr
ColumnDstAddr
ColumnSrcNetMask
ColumnDstNetMask
ColumnSrcNetPrefix
ColumnDstNetPrefix
ColumnSrcAS
ColumnDstAS
ColumnSrcVlan
ColumnDstVlan
ColumnSrcPort
ColumnDstPort
ColumnSrcNetName
ColumnDstNetName
ColumnSrcNetRole
ColumnDstNetRole
ColumnSrcNetSite
ColumnDstNetSite
ColumnSrcNetRegion
ColumnDstNetRegion
ColumnSrcNetTenant
ColumnDstNetTenant
ColumnSrcCountry
ColumnDstCountry
ColumnSrcGeoState
ColumnDstGeoState
ColumnSrcGeoCity
ColumnDstGeoCity
ColumnDstASPath
ColumnDst1stAS
ColumnDst2ndAS
ColumnDst3rdAS
ColumnDstCommunities
ColumnDstLargeCommunities
ColumnInIfName
ColumnOutIfName
ColumnInIfDescription
ColumnOutIfDescription
ColumnInIfSpeed
ColumnOutIfSpeed
ColumnInIfProvider
ColumnOutIfProvider
ColumnInIfConnectivity
ColumnOutIfConnectivity
ColumnInIfBoundary
ColumnOutIfBoundary
ColumnSrcAddrNAT
ColumnDstAddrNAT
ColumnSrcPortNAT
ColumnDstPortNAT
ColumnSrcMAC
ColumnDstMAC
ColumnIPTTL
ColumnIPTos
ColumnIPFragmentID
ColumnIPFragmentOffset
ColumnIPv6FlowLabel
ColumnTCPFlags
ColumnICMPv4
ColumnICMPv4Type
ColumnICMPv4Code
ColumnICMPv6
ColumnICMPv6Type
ColumnICMPv6Code
ColumnNextHop
ColumnMPLSLabels
ColumnMPLS1stLabel
ColumnMPLS2ndLabel
ColumnMPLS3rdLabel
ColumnMPLS4thLabel
// ColumnLast points to after the last static column, custom dictionaries
// (dynamic columns) come after ColumnLast
ColumnLast
)
const (
// Column groups. Numbering starts at 1 (iota + 1); columns declared without
// a Group keep the zero value.
ColumnGroupL2 ColumnGroup = iota + 1
ColumnGroupNAT
ColumnGroupL3L4
// ColumnGroupLast comes after the last group. finalize() uses it as the size
// of the disabledGroups bitset and as the upper bound when iterating groups.
ColumnGroupLast
)
// revive:enable
// flows returns the data schema for the flows tables. Any column whose name
// starts with Src/InIf is duplicated as Dst/OutIf by finalize(), unless a
// column for the mirrored name is already declared here. That's not the case
// for columns in `clickhousePrimaryKeys', which list both directions
// explicitly.
func flows() Schema {
return Schema{
clickhousePrimaryKeys: []ColumnKey{
ColumnTimeReceived,
ColumnExporterAddress,
ColumnEType,
ColumnProto,
ColumnInIfName,
ColumnSrcAS,
ColumnForwardingStatus,
ColumnOutIfName,
ColumnDstAS,
ColumnSamplingRate,
},
columns: []Column{
{
Key: ColumnTimeReceived,
NoDisable: true,
ClickHouseType: "DateTime",
ClickHouseCodec: "DoubleDelta, LZ4",
ConsoleNotDimension: true,
},
{Key: ColumnSamplingRate, NoDisable: true, ClickHouseType: "UInt64", ConsoleNotDimension: true},
{Key: ColumnExporterAddress, ParserType: "ip", ClickHouseType: "LowCardinality(IPv6)"},
{Key: ColumnExporterName, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnExporterGroup, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnExporterRole, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnExporterSite, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnExporterRegion, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnExporterTenant, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{
Key: ColumnSrcAddr,
ParserType: "ip",
ClickHouseMainOnly: true,
ClickHouseType: "IPv6",
ClickHouseCodec: "ZSTD(1)",
ConsoleTruncateIP: true,
},
{
Key: ColumnSrcNetMask,
ClickHouseMainOnly: true,
ClickHouseType: "UInt8",
ConsoleNotDimension: true,
},
// The prefix is an alias computed from SrcAddr and SrcNetMask. The Dst
// variant is not declared here: finalize() creates it and replaces "Src"
// with "Dst" in the alias expression.
{
Key: ColumnSrcNetPrefix,
ClickHouseMainOnly: true,
ClickHouseType: "String",
ClickHouseMaterializedType: "LowCardinality(String)",
ClickHouseAlias: `CASE
WHEN EType = 0x800 THEN concat(replaceRegexpOne(IPv6CIDRToRange(SrcAddr, (96 + SrcNetMask)::UInt8).1::String, '^::ffff:', ''), '/', SrcNetMask::String)
WHEN EType = 0x86dd THEN concat(IPv6CIDRToRange(SrcAddr, SrcNetMask).1::String, '/', SrcNetMask::String)
ELSE ''
END`,
},
// Columns using ClickHouseGenerateFrom are declared for both directions:
// when mirroring a column, finalize() only rewrites ClickHouseAlias, not
// ClickHouseGenerateFrom.
{
Key: ColumnSrcAS,
ClickHouseType: "UInt32",
ClickHouseGenerateFrom: "if(SrcAS = 0, c_SrcNetworks[asn], SrcAS)",
ClickHouseSelfGenerated: true,
},
{
Key: ColumnDstAS,
ClickHouseType: "UInt32",
ClickHouseGenerateFrom: "if(DstAS = 0, c_DstNetworks[asn], DstAS)",
ClickHouseSelfGenerated: true,
},
{
Key: ColumnSrcNetName,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[name]",
},
{
Key: ColumnDstNetName,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[name]",
},
{
Key: ColumnSrcNetRole,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[role]",
},
{
Key: ColumnDstNetRole,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[role]",
},
{
Key: ColumnSrcNetSite,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[site]",
},
{
Key: ColumnDstNetSite,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[site]",
},
{
Key: ColumnSrcNetRegion,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[region]",
},
{
Key: ColumnDstNetRegion,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[region]",
},
{
Key: ColumnSrcNetTenant,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[tenant]",
},
{
Key: ColumnDstNetTenant,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[tenant]",
},
{Key: ColumnSrcVlan, ParserType: "uint", ClickHouseType: "UInt16", Disabled: true, Group: ColumnGroupL2},
{
Key: ColumnSrcCountry,
ParserType: "string",
ClickHouseType: "FixedString(2)",
ClickHouseGenerateFrom: "c_SrcNetworks[country]",
},
{
Key: ColumnDstCountry,
ParserType: "string",
ClickHouseType: "FixedString(2)",
ClickHouseGenerateFrom: "c_DstNetworks[country]",
},
{
Key: ColumnSrcGeoCity,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[city]",
},
{
Key: ColumnDstGeoCity,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[city]",
},
{
Key: ColumnSrcGeoState,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_SrcNetworks[state]",
},
{
Key: ColumnDstGeoState,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
ClickHouseGenerateFrom: "c_DstNetworks[state]",
},
{
Key: ColumnDstASPath,
ClickHouseMainOnly: true,
ClickHouseType: "Array(UInt32)",
},
// The three columns below are generated from the first elements of the
// AS path and declare that dependency explicitly.
{
Key: ColumnDst1stAS,
Depends: []ColumnKey{ColumnDstASPath},
ClickHouseType: "UInt32",
ClickHouseGenerateFrom: "c_DstASPath[1]",
},
{
Key: ColumnDst2ndAS,
Depends: []ColumnKey{ColumnDstASPath},
ClickHouseType: "UInt32",
ClickHouseGenerateFrom: "c_DstASPath[2]",
},
{
Key: ColumnDst3rdAS,
Depends: []ColumnKey{ColumnDstASPath},
ClickHouseType: "UInt32",
ClickHouseGenerateFrom: "c_DstASPath[3]",
},
{
Key: ColumnDstCommunities,
ClickHouseMainOnly: true,
ClickHouseType: "Array(UInt32)",
},
{
Key: ColumnDstLargeCommunities,
ClickHouseMainOnly: true,
ClickHouseType: "Array(UInt128)",
ConsoleNotDimension: true,
},
// InIf* columns: finalize() adds the matching OutIf* columns.
{Key: ColumnInIfName, ParserType: "string", ClickHouseType: "LowCardinality(String)"},
{Key: ColumnInIfDescription, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnInIfSpeed, ParserType: "uint", ClickHouseType: "UInt32", ClickHouseNotSortingKey: true},
{Key: ColumnInIfConnectivity, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{Key: ColumnInIfProvider, ParserType: "string", ClickHouseType: "LowCardinality(String)", ClickHouseNotSortingKey: true},
{
Key: ColumnInIfBoundary,
// The enum values are the numeric values of the InterfaceBoundary constants.
ClickHouseType: fmt.Sprintf("Enum8('undefined' = %d, 'external' = %d, 'internal' = %d)", InterfaceBoundaryUndefined, InterfaceBoundaryExternal, InterfaceBoundaryInternal),
ClickHouseNotSortingKey: true,
},
{Key: ColumnEType, ClickHouseType: "UInt32"}, // TODO: UInt16 but hard to change, primary key
{Key: ColumnProto, ClickHouseType: "UInt32"}, // TODO: UInt8 but hard to change, primary key
{Key: ColumnSrcPort, ParserType: "uint", ClickHouseType: "UInt16", ClickHouseMainOnly: true},
{
Key: ColumnBytes,
NoDisable: true,
ClickHouseType: "UInt64",
ClickHouseCodec: "T64, LZ4",
ClickHouseNotSortingKey: true,
ConsoleNotDimension: true,
},
{
Key: ColumnPackets,
NoDisable: true,
ClickHouseType: "UInt64",
ClickHouseCodec: "T64, LZ4",
ClickHouseNotSortingKey: true,
ConsoleNotDimension: true,
},
{
Key: ColumnPacketSize,
Depends: []ColumnKey{ColumnBytes, ColumnPackets},
ParserType: "uint",
ClickHouseType: "UInt64",
ClickHouseAlias: "intDiv(Bytes, Packets)",
ConsoleNotDimension: true,
},
{
Key: ColumnPacketSizeBucket,
Depends: []ColumnKey{ColumnPacketSize},
ClickHouseType: "LowCardinality(String)",
// The alias is a multiIf() expression built from the boundaries below:
// each boundary is an exclusive upper bound producing the label
// 'previous-(boundary-1)', and the final default label is '65536-Inf'.
ClickHouseAlias: func() string {
boundaries := []int{
64, 128, 256, 512, 768, 1024, 1280, 1501,
2048, 3072, 4096, 8192, 10240, 16384, 32768, 65536,
}
conditions := []string{}
last := 0
for _, boundary := range boundaries {
conditions = append(conditions, fmt.Sprintf("PacketSize < %d, '%d-%d'",
boundary, last, boundary-1))
last = boundary
}
conditions = append(conditions, fmt.Sprintf("'%d-Inf'", last))
return fmt.Sprintf("multiIf(%s)", strings.Join(conditions, ", "))
}(),
},
{Key: ColumnForwardingStatus, ParserType: "uint", ClickHouseType: "UInt32"}, // TODO: UInt8 but hard to change, primary key
// The remaining columns are disabled by default.
{
Key: ColumnSrcAddrNAT,
Disabled: true,
Group: ColumnGroupNAT,
ParserType: "ip",
ClickHouseType: "IPv6",
ClickHouseMainOnly: true,
ConsoleTruncateIP: true,
},
{
Key: ColumnSrcPortNAT,
Disabled: true,
Group: ColumnGroupNAT,
ParserType: "uint",
ClickHouseType: "UInt16",
ClickHouseMainOnly: true,
},
{Key: ColumnSrcMAC, Disabled: true, Group: ColumnGroupL2, ClickHouseType: "UInt64"},
{Key: ColumnIPTTL, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{Key: ColumnIPTos, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{Key: ColumnIPFragmentID, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt32"},
{Key: ColumnIPFragmentOffset, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt16"},
{Key: ColumnIPv6FlowLabel, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt32"},
{Key: ColumnTCPFlags, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt16"},
{Key: ColumnICMPv4Type, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{Key: ColumnICMPv4Code, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{Key: ColumnICMPv6Type, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{Key: ColumnICMPv6Code, Disabled: true, Group: ColumnGroupL3L4, ParserType: "uint", ClickHouseType: "UInt8"},
{
Key: ColumnICMPv4,
Depends: []ColumnKey{ColumnProto, ColumnICMPv4Type, ColumnICMPv4Code},
Disabled: true,
Group: ColumnGroupL3L4,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
// For Proto = 1, look up the name in the ICMP dictionary and default to
// "type/code"; for any other protocol, the alias is the empty string.
ClickHouseAlias: `if(Proto = 1, ` +
fmt.Sprintf(`dictGetOrDefault('%s', 'name', tuple(Proto, ICMPv4Type, ICMPv4Code), `, DictionaryICMP) +
`concat(toString(ICMPv4Type), '/', toString(ICMPv4Code))), '')`,
},
{
Key: ColumnICMPv6,
Depends: []ColumnKey{ColumnProto, ColumnICMPv6Type, ColumnICMPv6Code},
Disabled: true,
Group: ColumnGroupL3L4,
ParserType: "string",
ClickHouseType: "LowCardinality(String)",
// Same construction as ICMPv4, for Proto = 58.
ClickHouseAlias: `if(Proto = 58, ` +
fmt.Sprintf(`dictGetOrDefault('%s', 'name', tuple(Proto, ICMPv6Type, ICMPv6Code), `, DictionaryICMP) +
`concat(toString(ICMPv6Type), '/', toString(ICMPv6Code))), '')`,
},
{
Key: ColumnNextHop,
Disabled: true,
ParserType: "ip",
ClickHouseType: "LowCardinality(IPv6)",
ClickHouseCodec: "ZSTD(1)",
},
{
Key: ColumnMPLSLabels,
Disabled: true,
ClickHouseMainOnly: true,
ClickHouseType: "Array(UInt32)",
ParserType: "array(uint)",
},
// The four columns below are aliases to the first four elements of
// MPLSLabels.
{
Key: ColumnMPLS1stLabel,
Disabled: true,
Depends: []ColumnKey{ColumnMPLSLabels},
ClickHouseMainOnly: true,
ClickHouseType: "UInt32",
ClickHouseAlias: "MPLSLabels[1]",
ParserType: "uint",
},
{
Key: ColumnMPLS2ndLabel,
Disabled: true,
Depends: []ColumnKey{ColumnMPLSLabels},
ClickHouseMainOnly: true,
ClickHouseType: "UInt32",
ClickHouseAlias: "MPLSLabels[2]",
ParserType: "uint",
},
{
Key: ColumnMPLS3rdLabel,
Disabled: true,
Depends: []ColumnKey{ColumnMPLSLabels},
ClickHouseMainOnly: true,
ClickHouseType: "UInt32",
ClickHouseAlias: "MPLSLabels[3]",
ParserType: "uint",
},
{
Key: ColumnMPLS4thLabel,
Disabled: true,
Depends: []ColumnKey{ColumnMPLSLabels},
ClickHouseMainOnly: true,
ClickHouseType: "UInt32",
ClickHouseAlias: "MPLSLabels[4]",
ParserType: "uint",
},
},
}.finalize()
}
// shouldProvideValue tells if we should send a value for this column to
// ClickHouse. Aliased columns never get a value. Other columns get one unless
// they are generated from an expression without being self-generated.
func (column *Column) shouldProvideValue() bool {
	if column.ClickHouseAlias != "" {
		return false
	}
	return column.ClickHouseSelfGenerated || column.ClickHouseGenerateFrom == ""
}
// finalize completes a schema definition and returns the result. It:
//
//   - gives each column its name from columnNameMap when none is set;
//   - marks non-main aliased columns as not being part of the sorting key;
//   - sorts and deduplicates column dependencies;
//   - mirrors Src*/InIf* columns as Dst*/OutIf* columns when no column for the
//     mirrored name is declared in the original definition;
//   - builds the key-indexed column lookup table;
//   - records the column groups that have no enabled column.
//
// It panics when a column key or a mirrored column name is missing from
// columnNameMap.
func (schema Schema) finalize() Schema {
	// declared tells if the definition being finalized contains a column for
	// the provided name. It is only called before schema.columns is replaced
	// by the expanded list.
	declared := func(name string) bool {
		key, _ := columnNameMap.LoadKey(name)
		return slices.ContainsFunc(schema.columns, func(candidate Column) bool {
			return candidate.Key == key
		})
	}

	expanded := []Column{}
	for _, col := range schema.columns {
		// Add true name
		trueName, found := columnNameMap.LoadValue(col.Key)
		if !found {
			panic(fmt.Sprintf("missing name mapping for %d", col.Key))
		}
		if col.Name == "" {
			col.Name = trueName
		}

		// Non-main columns with an alias are NotSortingKey
		if col.ClickHouseAlias != "" && !col.ClickHouseMainOnly {
			col.ClickHouseNotSortingKey = true
		}

		// Deduplicate dependencies
		slices.Sort(col.Depends)
		col.Depends = slices.Compact(col.Depends)
		expanded = append(expanded, col)

		// Expand the schema Src → Dst and InIf → OutIf
		var from, to string
		switch {
		case strings.HasPrefix(col.Name, "Src"):
			from, to = "Src", "Dst"
		case strings.HasPrefix(col.Name, "InIf"):
			from, to = "InIf", "OutIf"
		default:
			continue
		}
		mirror := col
		mirror.Name = to + col.Name[len(from):]
		if declared(mirror.Name) {
			continue
		}
		mirrorKey, found := columnNameMap.LoadKey(mirror.Name)
		if !found {
			panic(fmt.Sprintf("missing name mapping for %q", mirror.Name))
		}
		mirror.Key = mirrorKey
		mirror.ClickHouseAlias = strings.ReplaceAll(col.ClickHouseAlias, from, to)
		expanded = append(expanded, mirror)
	}
	schema.columns = expanded

	// Build column index: a slice indexed by column key, sized to hold the
	// highest key in use.
	highest := ColumnTimeReceived
	for i := range schema.columns {
		if key := schema.columns[i].Key; key > highest {
			highest = key
		}
	}
	schema.columnIndex = make([]*Column, highest+1)
	for i := range schema.columns {
		schema.columnIndex[schema.columns[i].Key] = &schema.columns[i]
	}

	// Update disabledGroups: a group is flagged unless at least one of its
	// columns is enabled.
	schema.disabledGroups = *bitset.New(uint(ColumnGroupLast))
	for group := ColumnGroup(0); group < ColumnGroupLast; group++ {
		schema.disabledGroups.Set(uint(group))
		hasEnabled := slices.ContainsFunc(schema.columns, func(candidate Column) bool {
			return candidate.Group == group && !candidate.Disabled
		})
		if hasEnabled {
			schema.disabledGroups.Clear(uint(group))
		}
	}
	return schema
}