Track reachability per node instead of per IP (a node is reachable if any of its IPs responds); increase ping interval to 5s and failure threshold to 5
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
18
errors.go
18
errors.go
@@ -206,19 +206,19 @@ func (e *ErrorTracker) GetUnreachableNodes() []string {
|
||||
return nodes
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) bool {
|
||||
changed, becameUnreachable := e.setUnreachableLocked(node, ip)
|
||||
func (e *ErrorTracker) SetUnreachable(node *Node) bool {
|
||||
changed, becameUnreachable := e.setUnreachableLocked(node)
|
||||
if changed {
|
||||
e.t.NotifyUpdate()
|
||||
}
|
||||
return becameUnreachable
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool, becameUnreachable bool) {
|
||||
func (e *ErrorTracker) setUnreachableLocked(node *Node) (changed bool, becameUnreachable bool) {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
|
||||
key := "unreachable:" + node.TypeID + ":" + ip
|
||||
key := "unreachable:" + node.TypeID
|
||||
|
||||
wasUnreachable := e.unreachableNodes[node.TypeID]
|
||||
e.unreachableNodes[node.TypeID] = true
|
||||
@@ -238,7 +238,7 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
|
||||
ID: fmt.Sprintf("err-%d", e.nextID),
|
||||
NodeTypeID: node.TypeID,
|
||||
NodeName: node.DisplayName(),
|
||||
PortName: ip,
|
||||
PortName: "",
|
||||
ErrorType: ErrorTypeUnreachable,
|
||||
FirstSeen: now,
|
||||
LastUpdated: now,
|
||||
@@ -246,19 +246,19 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
|
||||
return true, becameUnreachable
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) bool {
|
||||
changed, becameReachable := e.clearUnreachableLocked(node, ip)
|
||||
func (e *ErrorTracker) ClearUnreachable(node *Node) bool {
|
||||
changed, becameReachable := e.clearUnreachableLocked(node)
|
||||
if changed {
|
||||
e.t.NotifyUpdate()
|
||||
}
|
||||
return becameReachable
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) (changed bool, becameReachable bool) {
|
||||
func (e *ErrorTracker) clearUnreachableLocked(node *Node) (changed bool, becameReachable bool) {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
|
||||
key := "unreachable:" + node.TypeID + ":" + ip
|
||||
key := "unreachable:" + node.TypeID
|
||||
|
||||
delete(e.suppressedUnreachable, key)
|
||||
|
||||
|
||||
2
nodes.go
2
nodes.go
@@ -253,7 +253,7 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
|
||||
|
||||
go func() {
|
||||
pollTicker := time.NewTicker(10 * time.Second)
|
||||
pingTicker := time.NewTicker(1 * time.Second)
|
||||
pingTicker := time.NewTicker(5 * time.Second)
|
||||
defer pollTicker.Stop()
|
||||
defer pingTicker.Stop()
|
||||
|
||||
|
||||
39
ping.go
39
ping.go
@@ -24,7 +24,7 @@ type PingManager struct {
|
||||
failures map[string]int
|
||||
}
|
||||
|
||||
const pingFailureThreshold = 3
|
||||
const pingFailureThreshold = 5
|
||||
|
||||
func NewPingManager() *PingManager {
|
||||
pm := &PingManager{
|
||||
@@ -142,6 +142,7 @@ func (t *Tendrils) pingNode(node *Node) {
|
||||
t.nodes.mu.RLock()
|
||||
var ips []string
|
||||
nodeName := node.DisplayName()
|
||||
nodeID := node.TypeID
|
||||
for _, iface := range node.Interfaces {
|
||||
for ipStr := range iface.IPs {
|
||||
ip := net.ParseIP(ipStr)
|
||||
@@ -156,24 +157,28 @@ func (t *Tendrils) pingNode(node *Node) {
|
||||
return
|
||||
}
|
||||
|
||||
anyReachable := false
|
||||
for _, ipStr := range ips {
|
||||
reachable := t.ping.Ping(ipStr, 2*time.Second)
|
||||
if t.ping.Ping(ipStr, 2*time.Second) {
|
||||
anyReachable = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
t.ping.mu.Lock()
|
||||
if reachable {
|
||||
t.ping.failures[ipStr] = 0
|
||||
t.ping.mu.Unlock()
|
||||
if t.errors.ClearUnreachable(node, ipStr) {
|
||||
log.Printf("[ping] %s (%s) is now reachable", nodeName, ipStr)
|
||||
}
|
||||
} else {
|
||||
t.ping.failures[ipStr]++
|
||||
failures := t.ping.failures[ipStr]
|
||||
t.ping.mu.Unlock()
|
||||
if failures >= pingFailureThreshold {
|
||||
if t.errors.SetUnreachable(node, ipStr) {
|
||||
log.Printf("[ping] %s (%s) is now unreachable", nodeName, ipStr)
|
||||
}
|
||||
t.ping.mu.Lock()
|
||||
if anyReachable {
|
||||
t.ping.failures[nodeID] = 0
|
||||
t.ping.mu.Unlock()
|
||||
if t.errors.ClearUnreachable(node) {
|
||||
log.Printf("[ping] %s is now reachable", nodeName)
|
||||
}
|
||||
} else {
|
||||
t.ping.failures[nodeID]++
|
||||
failures := t.ping.failures[nodeID]
|
||||
t.ping.mu.Unlock()
|
||||
if failures >= pingFailureThreshold {
|
||||
if t.errors.SetUnreachable(node) {
|
||||
log.Printf("[ping] %s is now unreachable", nodeName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -910,11 +910,6 @@
|
||||
item.appendChild(nodeEl);
|
||||
|
||||
if (err.error_type === 'unreachable') {
|
||||
const ipEl = document.createElement('div');
|
||||
ipEl.className = 'error-port';
|
||||
ipEl.textContent = 'IP: ' + err.port_name;
|
||||
item.appendChild(ipEl);
|
||||
|
||||
const typeEl = document.createElement('div');
|
||||
typeEl.className = 'error-type';
|
||||
typeEl.textContent = 'Unreachable';
|
||||
|
||||
Reference in New Issue
Block a user