Track node reachability across all IPs, increase ping interval
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
18
errors.go
18
errors.go
@@ -206,19 +206,19 @@ func (e *ErrorTracker) GetUnreachableNodes() []string {
|
|||||||
return nodes
|
return nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) bool {
|
func (e *ErrorTracker) SetUnreachable(node *Node) bool {
|
||||||
changed, becameUnreachable := e.setUnreachableLocked(node, ip)
|
changed, becameUnreachable := e.setUnreachableLocked(node)
|
||||||
if changed {
|
if changed {
|
||||||
e.t.NotifyUpdate()
|
e.t.NotifyUpdate()
|
||||||
}
|
}
|
||||||
return becameUnreachable
|
return becameUnreachable
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool, becameUnreachable bool) {
|
func (e *ErrorTracker) setUnreachableLocked(node *Node) (changed bool, becameUnreachable bool) {
|
||||||
e.mu.Lock()
|
e.mu.Lock()
|
||||||
defer e.mu.Unlock()
|
defer e.mu.Unlock()
|
||||||
|
|
||||||
key := "unreachable:" + node.TypeID + ":" + ip
|
key := "unreachable:" + node.TypeID
|
||||||
|
|
||||||
wasUnreachable := e.unreachableNodes[node.TypeID]
|
wasUnreachable := e.unreachableNodes[node.TypeID]
|
||||||
e.unreachableNodes[node.TypeID] = true
|
e.unreachableNodes[node.TypeID] = true
|
||||||
@@ -238,7 +238,7 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
|
|||||||
ID: fmt.Sprintf("err-%d", e.nextID),
|
ID: fmt.Sprintf("err-%d", e.nextID),
|
||||||
NodeTypeID: node.TypeID,
|
NodeTypeID: node.TypeID,
|
||||||
NodeName: node.DisplayName(),
|
NodeName: node.DisplayName(),
|
||||||
PortName: ip,
|
PortName: "",
|
||||||
ErrorType: ErrorTypeUnreachable,
|
ErrorType: ErrorTypeUnreachable,
|
||||||
FirstSeen: now,
|
FirstSeen: now,
|
||||||
LastUpdated: now,
|
LastUpdated: now,
|
||||||
@@ -246,19 +246,19 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool
|
|||||||
return true, becameUnreachable
|
return true, becameUnreachable
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) bool {
|
func (e *ErrorTracker) ClearUnreachable(node *Node) bool {
|
||||||
changed, becameReachable := e.clearUnreachableLocked(node, ip)
|
changed, becameReachable := e.clearUnreachableLocked(node)
|
||||||
if changed {
|
if changed {
|
||||||
e.t.NotifyUpdate()
|
e.t.NotifyUpdate()
|
||||||
}
|
}
|
||||||
return becameReachable
|
return becameReachable
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) (changed bool, becameReachable bool) {
|
func (e *ErrorTracker) clearUnreachableLocked(node *Node) (changed bool, becameReachable bool) {
|
||||||
e.mu.Lock()
|
e.mu.Lock()
|
||||||
defer e.mu.Unlock()
|
defer e.mu.Unlock()
|
||||||
|
|
||||||
key := "unreachable:" + node.TypeID + ":" + ip
|
key := "unreachable:" + node.TypeID
|
||||||
|
|
||||||
delete(e.suppressedUnreachable, key)
|
delete(e.suppressedUnreachable, key)
|
||||||
|
|
||||||
|
|||||||
2
nodes.go
2
nodes.go
@@ -253,7 +253,7 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
|
|||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
pollTicker := time.NewTicker(10 * time.Second)
|
pollTicker := time.NewTicker(10 * time.Second)
|
||||||
pingTicker := time.NewTicker(1 * time.Second)
|
pingTicker := time.NewTicker(5 * time.Second)
|
||||||
defer pollTicker.Stop()
|
defer pollTicker.Stop()
|
||||||
defer pingTicker.Stop()
|
defer pingTicker.Stop()
|
||||||
|
|
||||||
|
|||||||
27
ping.go
27
ping.go
@@ -24,7 +24,7 @@ type PingManager struct {
|
|||||||
failures map[string]int
|
failures map[string]int
|
||||||
}
|
}
|
||||||
|
|
||||||
const pingFailureThreshold = 3
|
const pingFailureThreshold = 5
|
||||||
|
|
||||||
func NewPingManager() *PingManager {
|
func NewPingManager() *PingManager {
|
||||||
pm := &PingManager{
|
pm := &PingManager{
|
||||||
@@ -142,6 +142,7 @@ func (t *Tendrils) pingNode(node *Node) {
|
|||||||
t.nodes.mu.RLock()
|
t.nodes.mu.RLock()
|
||||||
var ips []string
|
var ips []string
|
||||||
nodeName := node.DisplayName()
|
nodeName := node.DisplayName()
|
||||||
|
nodeID := node.TypeID
|
||||||
for _, iface := range node.Interfaces {
|
for _, iface := range node.Interfaces {
|
||||||
for ipStr := range iface.IPs {
|
for ipStr := range iface.IPs {
|
||||||
ip := net.ParseIP(ipStr)
|
ip := net.ParseIP(ipStr)
|
||||||
@@ -156,24 +157,28 @@ func (t *Tendrils) pingNode(node *Node) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
anyReachable := false
|
||||||
for _, ipStr := range ips {
|
for _, ipStr := range ips {
|
||||||
reachable := t.ping.Ping(ipStr, 2*time.Second)
|
if t.ping.Ping(ipStr, 2*time.Second) {
|
||||||
|
anyReachable = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
t.ping.mu.Lock()
|
t.ping.mu.Lock()
|
||||||
if reachable {
|
if anyReachable {
|
||||||
t.ping.failures[ipStr] = 0
|
t.ping.failures[nodeID] = 0
|
||||||
t.ping.mu.Unlock()
|
t.ping.mu.Unlock()
|
||||||
if t.errors.ClearUnreachable(node, ipStr) {
|
if t.errors.ClearUnreachable(node) {
|
||||||
log.Printf("[ping] %s (%s) is now reachable", nodeName, ipStr)
|
log.Printf("[ping] %s is now reachable", nodeName)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
t.ping.failures[ipStr]++
|
t.ping.failures[nodeID]++
|
||||||
failures := t.ping.failures[ipStr]
|
failures := t.ping.failures[nodeID]
|
||||||
t.ping.mu.Unlock()
|
t.ping.mu.Unlock()
|
||||||
if failures >= pingFailureThreshold {
|
if failures >= pingFailureThreshold {
|
||||||
if t.errors.SetUnreachable(node, ipStr) {
|
if t.errors.SetUnreachable(node) {
|
||||||
log.Printf("[ping] %s (%s) is now unreachable", nodeName, ipStr)
|
log.Printf("[ping] %s is now unreachable", nodeName)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -910,11 +910,6 @@
|
|||||||
item.appendChild(nodeEl);
|
item.appendChild(nodeEl);
|
||||||
|
|
||||||
if (err.error_type === 'unreachable') {
|
if (err.error_type === 'unreachable') {
|
||||||
const ipEl = document.createElement('div');
|
|
||||||
ipEl.className = 'error-port';
|
|
||||||
ipEl.textContent = 'IP: ' + err.port_name;
|
|
||||||
item.appendChild(ipEl);
|
|
||||||
|
|
||||||
const typeEl = document.createElement('div');
|
const typeEl = document.createElement('div');
|
||||||
typeEl.className = 'error-type';
|
typeEl.className = 'error-type';
|
||||||
typeEl.textContent = 'Unreachable';
|
typeEl.textContent = 'Unreachable';
|
||||||
|
|||||||
Reference in New Issue
Block a user