2026-01-25 18:43:23 -08:00
|
|
|
package tendrils
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
2026-01-31 11:39:11 -08:00
|
|
|
"log"
|
2026-01-28 22:57:13 -08:00
|
|
|
"sort"
|
2026-01-25 18:43:23 -08:00
|
|
|
"sync"
|
|
|
|
|
"time"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Values stored in Error.Type by the ErrorTracker methods in this file.
const (
	// ErrorTypeNew marks entries created by AddPortError.
	ErrorTypeNew = "new"

	// ErrorTypeUnreachable marks entries created by AddUnreachable.
	ErrorTypeUnreachable = "unreachable"

	// ErrorTypeHighUtilization marks entries created by AddUtilizationError.
	ErrorTypeHighUtilization = "high_utilization"

	// ErrorTypePortFlap marks entries created by AddPortFlap.
	ErrorTypePortFlap = "port_flap"

	// ErrorTypePortDown marks entries created by AddPortDown.
	ErrorTypePortDown = "port_down"
)
|
|
|
|
|
|
2026-01-28 21:55:33 -08:00
|
|
|
// Error is a single entry held by ErrorTracker. Which of the optional fields
// are populated depends on the method that created the entry; see Type.
type Error struct {
	// ID is the tracker-assigned identifier, formatted as "err-<n>".
	ID string `json:"id"`

	// NodeID is copied from the node's ID when the entry is created.
	NodeID string `json:"node_id"`

	// NodeName is the node's DisplayName() at the time the entry is created.
	NodeName string `json:"node_name"`

	// Type is one of the ErrorType* constants.
	Type string `json:"type"`

	// Port is the port name passed to the port-level Add* methods; it is
	// left empty by AddUnreachable.
	Port string `json:"port,omitempty"`

	// InErrors is the latest InErrors value taken from the interface stats.
	InErrors uint64 `json:"in_errors,omitempty"`

	// OutErrors is the latest OutErrors value taken from the interface stats.
	OutErrors uint64 `json:"out_errors,omitempty"`

	// InDelta is the running sum of the inDelta values given to AddPortError.
	InDelta uint64 `json:"in_delta,omitempty"`

	// OutDelta is the running sum of the outDelta values given to AddPortError.
	OutDelta uint64 `json:"out_delta,omitempty"`

	// Utilization is the largest utilization value reported for the entry.
	Utilization float64 `json:"utilization,omitempty"`

	// FirstSeen is the UTC time at which the entry was created.
	FirstSeen time.Time `json:"first_seen"`

	// LastSeen is the UTC time of the most recent report for the entry.
	LastSeen time.Time `json:"last_seen"`

	// LastUpdated is written on some create and refresh paths only; it stays
	// at the zero time otherwise, hence the omitzero tag.
	LastUpdated time.Time `json:"last_updated,omitzero"`
}
|
|
|
|
|
|
|
|
|
|
type ErrorTracker struct {
|
2026-01-28 23:06:26 -08:00
|
|
|
mu sync.RWMutex
|
|
|
|
|
errors map[string]*Error
|
|
|
|
|
nextID int
|
|
|
|
|
t *Tendrils
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:49:39 -08:00
|
|
|
func NewErrorTracker(t *Tendrils) *ErrorTracker {
|
2026-01-25 18:43:23 -08:00
|
|
|
return &ErrorTracker{
|
2026-01-28 23:06:26 -08:00
|
|
|
errors: map[string]*Error{},
|
|
|
|
|
t: t,
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
func (e *ErrorTracker) AddUnreachable(node *Node) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "unreachable:" + node.ID
|
2026-02-02 09:59:03 -08:00
|
|
|
now := time.Now().UTC()
|
|
|
|
|
|
|
|
|
|
if existing, exists := e.errors[key]; exists {
|
|
|
|
|
existing.LastSeen = now
|
|
|
|
|
e.t.NotifyUpdate()
|
2026-01-25 18:43:23 -08:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &Error{
|
2026-02-02 09:59:03 -08:00
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeID: node.ID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
Type: ErrorTypeUnreachable,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastSeen: now,
|
2026-01-25 18:49:39 -08:00
|
|
|
}
|
2026-01-28 23:06:26 -08:00
|
|
|
e.t.NotifyUpdate()
|
2026-01-25 18:49:39 -08:00
|
|
|
}
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
func (e *ErrorTracker) RemoveUnreachable(node *Node) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
2026-01-26 12:16:44 -08:00
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
key := "unreachable:" + node.ID
|
|
|
|
|
if _, exists := e.errors[key]; exists {
|
|
|
|
|
delete(e.errors, key)
|
2026-01-26 12:16:44 -08:00
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
func (e *ErrorTracker) AddPortError(node *Node, portName string, stats *InterfaceStats, inDelta, outDelta uint64) {
|
2026-01-26 12:16:44 -08:00
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
key := node.ID + ":" + portName
|
2026-02-02 09:59:03 -08:00
|
|
|
now := time.Now().UTC()
|
2026-01-28 23:06:26 -08:00
|
|
|
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
|
|
|
|
existing.InErrors = stats.InErrors
|
|
|
|
|
existing.OutErrors = stats.OutErrors
|
|
|
|
|
existing.InDelta += inDelta
|
|
|
|
|
existing.OutDelta += outDelta
|
2026-02-02 09:59:03 -08:00
|
|
|
existing.LastSeen = now
|
2026-01-28 23:06:26 -08:00
|
|
|
existing.LastUpdated = now
|
|
|
|
|
} else {
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &Error{
|
|
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeID: node.ID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
Port: portName,
|
|
|
|
|
Type: ErrorTypeNew,
|
|
|
|
|
InErrors: stats.InErrors,
|
|
|
|
|
OutErrors: stats.OutErrors,
|
|
|
|
|
InDelta: inDelta,
|
|
|
|
|
OutDelta: outDelta,
|
|
|
|
|
FirstSeen: now,
|
2026-02-02 09:59:03 -08:00
|
|
|
LastSeen: now,
|
2026-01-28 23:06:26 -08:00
|
|
|
LastUpdated: now,
|
|
|
|
|
}
|
2026-01-31 11:39:11 -08:00
|
|
|
log.Printf("[ERROR] port errors on %s %s: in=%d out=%d", node.DisplayName(), portName, inDelta, outDelta)
|
2026-01-26 12:16:44 -08:00
|
|
|
}
|
2026-01-28 23:06:26 -08:00
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
2026-01-26 12:16:44 -08:00
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
func (e *ErrorTracker) AddUtilizationError(node *Node, portName string, utilization float64) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
2026-01-26 12:16:44 -08:00
|
|
|
|
2026-01-28 22:36:44 -08:00
|
|
|
key := "util:" + node.ID + ":" + portName
|
2026-02-02 09:59:03 -08:00
|
|
|
now := time.Now().UTC()
|
2026-01-26 12:16:44 -08:00
|
|
|
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
2026-02-02 09:59:03 -08:00
|
|
|
existing.LastSeen = now
|
2026-01-26 12:16:44 -08:00
|
|
|
if utilization > existing.Utilization {
|
|
|
|
|
existing.Utilization = utilization
|
|
|
|
|
existing.LastUpdated = now
|
|
|
|
|
}
|
2026-02-02 09:59:03 -08:00
|
|
|
e.t.NotifyUpdate()
|
2026-01-28 23:06:26 -08:00
|
|
|
return
|
2026-01-26 12:16:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.nextID++
|
2026-01-28 21:55:33 -08:00
|
|
|
e.errors[key] = &Error{
|
2026-01-26 12:16:44 -08:00
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
2026-01-28 23:06:26 -08:00
|
|
|
NodeID: node.ID,
|
2026-01-26 12:16:44 -08:00
|
|
|
NodeName: node.DisplayName(),
|
2026-01-28 21:55:33 -08:00
|
|
|
Port: portName,
|
|
|
|
|
Type: ErrorTypeHighUtilization,
|
2026-01-26 12:16:44 -08:00
|
|
|
Utilization: utilization,
|
|
|
|
|
FirstSeen: now,
|
2026-02-02 09:59:03 -08:00
|
|
|
LastSeen: now,
|
2026-01-26 12:16:44 -08:00
|
|
|
LastUpdated: now,
|
|
|
|
|
}
|
2026-02-01 17:05:06 -08:00
|
|
|
log.Printf("[ERROR] high utilization on %s %s: %.0f%%", node.DisplayName(), portName, utilization)
|
2026-01-28 23:06:26 -08:00
|
|
|
e.t.NotifyUpdate()
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
2026-02-02 09:59:03 -08:00
|
|
|
func (e *ErrorTracker) UpdateUtilizationLastSeen(node *Node, portName string, utilization float64) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "util:" + node.ID + ":" + portName
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
|
|
|
|
now := time.Now().UTC()
|
|
|
|
|
existing.LastSeen = now
|
|
|
|
|
if utilization > existing.Utilization {
|
|
|
|
|
existing.Utilization = utilization
|
|
|
|
|
existing.LastUpdated = now
|
|
|
|
|
}
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-31 13:01:07 -08:00
|
|
|
func (e *ErrorTracker) AddPortFlap(node *Node, portName string) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "flap:" + node.ID + ":" + portName
|
2026-02-02 09:59:03 -08:00
|
|
|
now := time.Now().UTC()
|
2026-01-31 13:01:07 -08:00
|
|
|
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
2026-02-02 09:59:03 -08:00
|
|
|
existing.LastSeen = now
|
2026-01-31 13:01:07 -08:00
|
|
|
existing.LastUpdated = now
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &Error{
|
2026-02-02 09:59:03 -08:00
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeID: node.ID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
Port: portName,
|
|
|
|
|
Type: ErrorTypePortFlap,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastSeen: now,
|
2026-01-31 13:01:07 -08:00
|
|
|
}
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) AddPortDown(node *Node, portName string) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "down:" + node.ID + ":" + portName
|
2026-02-02 09:59:03 -08:00
|
|
|
now := time.Now().UTC()
|
2026-01-31 13:01:07 -08:00
|
|
|
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
2026-02-02 09:59:03 -08:00
|
|
|
existing.LastSeen = now
|
2026-01-31 13:01:07 -08:00
|
|
|
existing.LastUpdated = now
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &Error{
|
2026-02-02 09:59:03 -08:00
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeID: node.ID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
Port: portName,
|
|
|
|
|
Type: ErrorTypePortDown,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastSeen: now,
|
2026-01-31 13:01:07 -08:00
|
|
|
}
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:43:23 -08:00
|
|
|
func (e *ErrorTracker) ClearError(errorID string) {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
for key, err := range e.errors {
|
|
|
|
|
if err.ID == errorID {
|
|
|
|
|
delete(e.errors, key)
|
2026-01-28 23:06:26 -08:00
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
return
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) ClearAllErrors() {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
2026-01-28 23:06:26 -08:00
|
|
|
if len(e.errors) > 0 {
|
|
|
|
|
e.errors = map[string]*Error{}
|
|
|
|
|
e.t.NotifyUpdate()
|
2026-01-25 19:02:36 -08:00
|
|
|
}
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
2026-01-28 21:50:48 -08:00
|
|
|
func (e *ErrorTracker) GetErrors() []*Error {
|
2026-01-25 18:43:23 -08:00
|
|
|
e.mu.RLock()
|
|
|
|
|
defer e.mu.RUnlock()
|
|
|
|
|
|
2026-01-28 21:50:48 -08:00
|
|
|
errors := make([]*Error, 0, len(e.errors))
|
2026-01-25 18:43:23 -08:00
|
|
|
for _, err := range e.errors {
|
2026-01-28 21:55:33 -08:00
|
|
|
errors = append(errors, err)
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
2026-01-28 22:57:13 -08:00
|
|
|
sort.Slice(errors, func(i, j int) bool {
|
|
|
|
|
if errors[i].NodeName != errors[j].NodeName {
|
|
|
|
|
return errors[i].NodeName < errors[j].NodeName
|
|
|
|
|
}
|
|
|
|
|
return errors[i].Port < errors[j].Port
|
|
|
|
|
})
|
2026-01-25 18:43:23 -08:00
|
|
|
return errors
|
|
|
|
|
}
|