2026-01-25 18:43:23 -08:00
|
|
|
package tendrils
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
|
|
|
|
"sync"
|
|
|
|
|
"time"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// PortErrorType names the category of a tracked error. Its string value is
// what gets serialized in the "error_type" JSON field of a PortError.
type PortErrorType string
|
|
|
|
|
|
|
|
|
|
const (
|
2026-01-25 18:56:12 -08:00
|
|
|
ErrorTypeStartup PortErrorType = "startup"
|
|
|
|
|
ErrorTypeNew PortErrorType = "new"
|
|
|
|
|
ErrorTypeUnreachable PortErrorType = "unreachable"
|
2026-01-25 18:43:23 -08:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
type PortError struct {
|
|
|
|
|
ID string `json:"id"`
|
|
|
|
|
NodeTypeID string `json:"node_typeid"`
|
|
|
|
|
NodeName string `json:"node_name"`
|
|
|
|
|
PortName string `json:"port_name"`
|
|
|
|
|
ErrorType PortErrorType `json:"error_type"`
|
|
|
|
|
InErrors uint64 `json:"in_errors"`
|
|
|
|
|
OutErrors uint64 `json:"out_errors"`
|
|
|
|
|
InDelta uint64 `json:"in_delta,omitempty"`
|
|
|
|
|
OutDelta uint64 `json:"out_delta,omitempty"`
|
|
|
|
|
FirstSeen time.Time `json:"first_seen"`
|
|
|
|
|
LastUpdated time.Time `json:"last_updated"`
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// portErrorBaseline remembers the last error counters seen for one port so
// that later readings can be turned into deltas.
type portErrorBaseline struct {
	// InErrors and OutErrors are the counter values from the latest check.
	InErrors, OutErrors uint64
	// HasData reports whether the counters above have been populated.
	HasData bool
}
|
|
|
|
|
|
|
|
|
|
type ErrorTracker struct {
|
2026-01-25 19:05:13 -08:00
|
|
|
mu sync.RWMutex
|
|
|
|
|
errors map[string]*PortError
|
|
|
|
|
baselines map[string]*portErrorBaseline
|
2026-01-25 19:02:36 -08:00
|
|
|
suppressedUnreachable map[string]bool
|
2026-01-25 19:05:13 -08:00
|
|
|
unreachableNodes map[string]bool
|
|
|
|
|
nextID int
|
|
|
|
|
t *Tendrils
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:49:39 -08:00
|
|
|
func NewErrorTracker(t *Tendrils) *ErrorTracker {
|
2026-01-25 18:43:23 -08:00
|
|
|
return &ErrorTracker{
|
2026-01-25 19:05:13 -08:00
|
|
|
errors: map[string]*PortError{},
|
|
|
|
|
baselines: map[string]*portErrorBaseline{},
|
2026-01-25 19:02:36 -08:00
|
|
|
suppressedUnreachable: map[string]bool{},
|
2026-01-25 19:05:13 -08:00
|
|
|
unreachableNodes: map[string]bool{},
|
|
|
|
|
t: t,
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) CheckPort(node *Node, portName string, stats *InterfaceStats) {
|
|
|
|
|
if stats == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:49:39 -08:00
|
|
|
changed := e.checkPortLocked(node, portName, stats)
|
|
|
|
|
if changed {
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) checkPortLocked(node *Node, portName string, stats *InterfaceStats) bool {
|
2026-01-25 18:43:23 -08:00
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := node.TypeID + ":" + portName
|
|
|
|
|
baseline := e.baselines[key]
|
|
|
|
|
|
|
|
|
|
now := time.Now()
|
|
|
|
|
|
|
|
|
|
if baseline == nil || !baseline.HasData {
|
|
|
|
|
e.baselines[key] = &portErrorBaseline{
|
|
|
|
|
InErrors: stats.InErrors,
|
|
|
|
|
OutErrors: stats.OutErrors,
|
|
|
|
|
HasData: true,
|
|
|
|
|
}
|
|
|
|
|
if stats.InErrors > 0 || stats.OutErrors > 0 {
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &PortError{
|
|
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeTypeID: node.TypeID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
PortName: portName,
|
|
|
|
|
ErrorType: ErrorTypeStartup,
|
|
|
|
|
InErrors: stats.InErrors,
|
|
|
|
|
OutErrors: stats.OutErrors,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastUpdated: now,
|
|
|
|
|
}
|
2026-01-25 18:49:39 -08:00
|
|
|
return true
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
2026-01-25 18:49:39 -08:00
|
|
|
return false
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inDelta := uint64(0)
|
|
|
|
|
outDelta := uint64(0)
|
|
|
|
|
if stats.InErrors > baseline.InErrors {
|
|
|
|
|
inDelta = stats.InErrors - baseline.InErrors
|
|
|
|
|
}
|
|
|
|
|
if stats.OutErrors > baseline.OutErrors {
|
|
|
|
|
outDelta = stats.OutErrors - baseline.OutErrors
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:49:39 -08:00
|
|
|
changed := false
|
2026-01-25 18:43:23 -08:00
|
|
|
if inDelta > 0 || outDelta > 0 {
|
|
|
|
|
if existing, ok := e.errors[key]; ok {
|
|
|
|
|
existing.InErrors = stats.InErrors
|
|
|
|
|
existing.OutErrors = stats.OutErrors
|
|
|
|
|
existing.InDelta += inDelta
|
|
|
|
|
existing.OutDelta += outDelta
|
|
|
|
|
existing.LastUpdated = now
|
|
|
|
|
} else {
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &PortError{
|
|
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeTypeID: node.TypeID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
PortName: portName,
|
|
|
|
|
ErrorType: ErrorTypeNew,
|
|
|
|
|
InErrors: stats.InErrors,
|
|
|
|
|
OutErrors: stats.OutErrors,
|
|
|
|
|
InDelta: inDelta,
|
|
|
|
|
OutDelta: outDelta,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastUpdated: now,
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-01-25 18:49:39 -08:00
|
|
|
changed = true
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.baselines[key].InErrors = stats.InErrors
|
|
|
|
|
e.baselines[key].OutErrors = stats.OutErrors
|
2026-01-25 18:49:39 -08:00
|
|
|
|
|
|
|
|
return changed
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) ClearError(errorID string) {
|
2026-01-25 18:49:39 -08:00
|
|
|
found := e.clearErrorLocked(errorID)
|
|
|
|
|
if found {
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) clearErrorLocked(errorID string) bool {
|
2026-01-25 18:43:23 -08:00
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
for key, err := range e.errors {
|
|
|
|
|
if err.ID == errorID {
|
2026-01-25 19:02:36 -08:00
|
|
|
if err.ErrorType == ErrorTypeUnreachable {
|
|
|
|
|
e.suppressedUnreachable[key] = true
|
|
|
|
|
}
|
2026-01-25 18:43:23 -08:00
|
|
|
delete(e.errors, key)
|
2026-01-25 18:49:39 -08:00
|
|
|
return true
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
}
|
2026-01-25 18:49:39 -08:00
|
|
|
return false
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) ClearAllErrors() {
|
2026-01-25 18:49:39 -08:00
|
|
|
had := e.clearAllErrorsLocked()
|
|
|
|
|
if had {
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) clearAllErrorsLocked() bool {
|
2026-01-25 18:43:23 -08:00
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
2026-01-25 18:49:39 -08:00
|
|
|
had := len(e.errors) > 0
|
2026-01-25 19:02:36 -08:00
|
|
|
for key, err := range e.errors {
|
|
|
|
|
if err.ErrorType == ErrorTypeUnreachable {
|
|
|
|
|
e.suppressedUnreachable[key] = true
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-01-25 18:43:23 -08:00
|
|
|
e.errors = map[string]*PortError{}
|
2026-01-25 18:49:39 -08:00
|
|
|
return had
|
2026-01-25 18:43:23 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) GetErrors() []*PortError {
|
|
|
|
|
e.mu.RLock()
|
|
|
|
|
defer e.mu.RUnlock()
|
|
|
|
|
|
|
|
|
|
errors := make([]*PortError, 0, len(e.errors))
|
|
|
|
|
for _, err := range e.errors {
|
|
|
|
|
errors = append(errors, err)
|
|
|
|
|
}
|
|
|
|
|
return errors
|
|
|
|
|
}
|
2026-01-25 18:56:12 -08:00
|
|
|
|
2026-01-25 19:05:13 -08:00
|
|
|
func (e *ErrorTracker) GetUnreachableNodes() []string {
|
|
|
|
|
e.mu.RLock()
|
|
|
|
|
defer e.mu.RUnlock()
|
|
|
|
|
|
|
|
|
|
nodes := make([]string, 0, len(e.unreachableNodes))
|
|
|
|
|
for nodeTypeID := range e.unreachableNodes {
|
|
|
|
|
nodes = append(nodes, nodeTypeID)
|
|
|
|
|
}
|
|
|
|
|
return nodes
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:56:12 -08:00
|
|
|
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) {
|
|
|
|
|
changed := e.setUnreachableLocked(node, ip)
|
|
|
|
|
if changed {
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) bool {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "unreachable:" + node.TypeID + ":" + ip
|
2026-01-25 19:02:36 -08:00
|
|
|
|
2026-01-25 19:05:13 -08:00
|
|
|
wasUnreachable := e.unreachableNodes[node.TypeID]
|
|
|
|
|
e.unreachableNodes[node.TypeID] = true
|
|
|
|
|
|
2026-01-25 19:02:36 -08:00
|
|
|
if e.suppressedUnreachable[key] {
|
2026-01-25 19:05:13 -08:00
|
|
|
return !wasUnreachable
|
2026-01-25 19:02:36 -08:00
|
|
|
}
|
|
|
|
|
|
2026-01-25 18:56:12 -08:00
|
|
|
if _, exists := e.errors[key]; exists {
|
2026-01-25 19:05:13 -08:00
|
|
|
return !wasUnreachable
|
2026-01-25 18:56:12 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
now := time.Now()
|
|
|
|
|
e.nextID++
|
|
|
|
|
e.errors[key] = &PortError{
|
|
|
|
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
|
|
|
|
NodeTypeID: node.TypeID,
|
|
|
|
|
NodeName: node.DisplayName(),
|
|
|
|
|
PortName: ip,
|
|
|
|
|
ErrorType: ErrorTypeUnreachable,
|
|
|
|
|
FirstSeen: now,
|
|
|
|
|
LastUpdated: now,
|
|
|
|
|
}
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) {
|
|
|
|
|
changed := e.clearUnreachableLocked(node, ip)
|
|
|
|
|
if changed {
|
|
|
|
|
e.t.NotifyUpdate()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) bool {
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
|
|
|
|
|
|
key := "unreachable:" + node.TypeID + ":" + ip
|
2026-01-25 19:02:36 -08:00
|
|
|
|
|
|
|
|
delete(e.suppressedUnreachable, key)
|
|
|
|
|
|
2026-01-25 19:05:13 -08:00
|
|
|
wasUnreachable := e.unreachableNodes[node.TypeID]
|
|
|
|
|
delete(e.unreachableNodes, node.TypeID)
|
|
|
|
|
|
2026-01-25 18:56:12 -08:00
|
|
|
if _, exists := e.errors[key]; exists {
|
|
|
|
|
delete(e.errors, key)
|
|
|
|
|
return true
|
|
|
|
|
}
|
2026-01-25 19:05:13 -08:00
|
|
|
return wasUnreachable
|
2026-01-25 18:56:12 -08:00
|
|
|
}
|