Track node reachability across all IPs, increase ping interval

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Ian Gulliver
2026-01-25 21:03:15 -08:00
parent 5cd5db1e4a
commit 04e22b03cb
4 changed files with 32 additions and 32 deletions

View File

@@ -206,19 +206,19 @@ func (e *ErrorTracker) GetUnreachableNodes() []string {
return nodes return nodes
} }
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) bool { func (e *ErrorTracker) SetUnreachable(node *Node) bool {
changed, becameUnreachable := e.setUnreachableLocked(node, ip) changed, becameUnreachable := e.setUnreachableLocked(node)
if changed { if changed {
e.t.NotifyUpdate() e.t.NotifyUpdate()
} }
return becameUnreachable return becameUnreachable
} }
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool, becameUnreachable bool) { func (e *ErrorTracker) setUnreachableLocked(node *Node) (changed bool, becameUnreachable bool) {
e.mu.Lock() e.mu.Lock()
defer e.mu.Unlock() defer e.mu.Unlock()
key := "unreachable:" + node.TypeID + ":" + ip key := "unreachable:" + node.TypeID
wasUnreachable := e.unreachableNodes[node.TypeID] wasUnreachable := e.unreachableNodes[node.TypeID]
e.unreachableNodes[node.TypeID] = true e.unreachableNodes[node.TypeID] = true
@@ -238,7 +238,7 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
ID: fmt.Sprintf("err-%d", e.nextID), ID: fmt.Sprintf("err-%d", e.nextID),
NodeTypeID: node.TypeID, NodeTypeID: node.TypeID,
NodeName: node.DisplayName(), NodeName: node.DisplayName(),
PortName: ip, PortName: "",
ErrorType: ErrorTypeUnreachable, ErrorType: ErrorTypeUnreachable,
FirstSeen: now, FirstSeen: now,
LastUpdated: now, LastUpdated: now,
@@ -246,19 +246,19 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
return true, becameUnreachable return true, becameUnreachable
} }
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) bool { func (e *ErrorTracker) ClearUnreachable(node *Node) bool {
changed, becameReachable := e.clearUnreachableLocked(node, ip) changed, becameReachable := e.clearUnreachableLocked(node)
if changed { if changed {
e.t.NotifyUpdate() e.t.NotifyUpdate()
} }
return becameReachable return becameReachable
} }
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) (changed bool, becameReachable bool) { func (e *ErrorTracker) clearUnreachableLocked(node *Node) (changed bool, becameReachable bool) {
e.mu.Lock() e.mu.Lock()
defer e.mu.Unlock() defer e.mu.Unlock()
key := "unreachable:" + node.TypeID + ":" + ip key := "unreachable:" + node.TypeID
delete(e.suppressedUnreachable, key) delete(e.suppressedUnreachable, key)

View File

@@ -253,7 +253,7 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
go func() { go func() {
pollTicker := time.NewTicker(10 * time.Second) pollTicker := time.NewTicker(10 * time.Second)
pingTicker := time.NewTicker(1 * time.Second) pingTicker := time.NewTicker(5 * time.Second)
defer pollTicker.Stop() defer pollTicker.Stop()
defer pingTicker.Stop() defer pingTicker.Stop()

27
ping.go
View File

@@ -24,7 +24,7 @@ type PingManager struct {
failures map[string]int failures map[string]int
} }
const pingFailureThreshold = 3 const pingFailureThreshold = 5
func NewPingManager() *PingManager { func NewPingManager() *PingManager {
pm := &PingManager{ pm := &PingManager{
@@ -142,6 +142,7 @@ func (t *Tendrils) pingNode(node *Node) {
t.nodes.mu.RLock() t.nodes.mu.RLock()
var ips []string var ips []string
nodeName := node.DisplayName() nodeName := node.DisplayName()
nodeID := node.TypeID
for _, iface := range node.Interfaces { for _, iface := range node.Interfaces {
for ipStr := range iface.IPs { for ipStr := range iface.IPs {
ip := net.ParseIP(ipStr) ip := net.ParseIP(ipStr)
@@ -156,24 +157,28 @@ func (t *Tendrils) pingNode(node *Node) {
return return
} }
anyReachable := false
for _, ipStr := range ips { for _, ipStr := range ips {
reachable := t.ping.Ping(ipStr, 2*time.Second) if t.ping.Ping(ipStr, 2*time.Second) {
anyReachable = true
break
}
}
t.ping.mu.Lock() t.ping.mu.Lock()
if reachable { if anyReachable {
t.ping.failures[ipStr] = 0 t.ping.failures[nodeID] = 0
t.ping.mu.Unlock() t.ping.mu.Unlock()
if t.errors.ClearUnreachable(node, ipStr) { if t.errors.ClearUnreachable(node) {
log.Printf("[ping] %s (%s) is now reachable", nodeName, ipStr) log.Printf("[ping] %s is now reachable", nodeName)
} }
} else { } else {
t.ping.failures[ipStr]++ t.ping.failures[nodeID]++
failures := t.ping.failures[ipStr] failures := t.ping.failures[nodeID]
t.ping.mu.Unlock() t.ping.mu.Unlock()
if failures >= pingFailureThreshold { if failures >= pingFailureThreshold {
if t.errors.SetUnreachable(node, ipStr) { if t.errors.SetUnreachable(node) {
log.Printf("[ping] %s (%s) is now unreachable", nodeName, ipStr) log.Printf("[ping] %s is now unreachable", nodeName)
}
} }
} }
} }

View File

@@ -910,11 +910,6 @@
item.appendChild(nodeEl); item.appendChild(nodeEl);
if (err.error_type === 'unreachable') { if (err.error_type === 'unreachable') {
const ipEl = document.createElement('div');
ipEl.className = 'error-port';
ipEl.textContent = 'IP: ' + err.port_name;
item.appendChild(ipEl);
const typeEl = document.createElement('div'); const typeEl = document.createElement('div');
typeEl.className = 'error-type'; typeEl.className = 'error-type';
typeEl.textContent = 'Unreachable'; typeEl.textContent = 'Unreachable';