Add per-node ping monitoring with unreachable error tracking
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
54
errors.go
54
errors.go
@@ -9,8 +9,9 @@ import (
|
||||
// PortErrorType is the category tag stored in PortError.ErrorType.
type PortErrorType string

const (
	// ErrorTypeStartup is the "startup" category; the web UI labels it
	// "Present at startup".
	ErrorTypeStartup PortErrorType = "startup"
	// ErrorTypeNew is the "new" category; the web UI labels it
	// "New errors detected".
	ErrorTypeNew PortErrorType = "new"
	// ErrorTypeUnreachable is the "unreachable" category, recorded when a
	// ping to one of a node's IP addresses gets no reply.
	ErrorTypeUnreachable PortErrorType = "unreachable"
)
|
||||
|
||||
type PortError struct {
|
||||
@@ -181,3 +182,52 @@ func (e *ErrorTracker) GetErrors() []*PortError {
|
||||
}
|
||||
return errors
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) {
|
||||
changed := e.setUnreachableLocked(node, ip)
|
||||
if changed {
|
||||
e.t.NotifyUpdate()
|
||||
}
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) bool {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
|
||||
key := "unreachable:" + node.TypeID + ":" + ip
|
||||
if _, exists := e.errors[key]; exists {
|
||||
return false
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
e.nextID++
|
||||
e.errors[key] = &PortError{
|
||||
ID: fmt.Sprintf("err-%d", e.nextID),
|
||||
NodeTypeID: node.TypeID,
|
||||
NodeName: node.DisplayName(),
|
||||
PortName: ip,
|
||||
ErrorType: ErrorTypeUnreachable,
|
||||
FirstSeen: now,
|
||||
LastUpdated: now,
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) {
|
||||
changed := e.clearUnreachableLocked(node, ip)
|
||||
if changed {
|
||||
e.t.NotifyUpdate()
|
||||
}
|
||||
}
|
||||
|
||||
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) bool {
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
|
||||
key := "unreachable:" + node.TypeID + ":" + ip
|
||||
if _, exists := e.errors[key]; exists {
|
||||
delete(e.errors, key)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
10
nodes.go
10
nodes.go
@@ -252,8 +252,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
|
||||
n.nodeCancel[nodeID] = cancel
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(10 * time.Second)
|
||||
defer ticker.Stop()
|
||||
pollTicker := time.NewTicker(10 * time.Second)
|
||||
pingTicker := time.NewTicker(1 * time.Second)
|
||||
defer pollTicker.Stop()
|
||||
defer pingTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
@@ -261,8 +263,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
|
||||
return
|
||||
case <-node.pollTrigger:
|
||||
n.t.pollNode(node)
|
||||
case <-ticker.C:
|
||||
case <-pollTicker.C:
|
||||
n.t.pollNode(node)
|
||||
case <-pingTicker.C:
|
||||
n.t.pingNode(node)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
89
ping.go
Normal file
89
ping.go
Normal file
@@ -0,0 +1,89 @@
|
||||
package tendrils
|
||||
|
||||
import (
|
||||
"net"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
)
|
||||
|
||||
func (t *Tendrils) pingNode(node *Node) {
|
||||
t.nodes.mu.RLock()
|
||||
var ips []string
|
||||
for _, iface := range node.Interfaces {
|
||||
for ipStr := range iface.IPs {
|
||||
ip := net.ParseIP(ipStr)
|
||||
if ip != nil && ip.To4() != nil {
|
||||
ips = append(ips, ipStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
t.nodes.mu.RUnlock()
|
||||
|
||||
if len(ips) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for _, ipStr := range ips {
|
||||
reachable := t.pingIP(ipStr)
|
||||
if reachable {
|
||||
t.errors.ClearUnreachable(node, ipStr)
|
||||
} else {
|
||||
t.errors.SetUnreachable(node, ipStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tendrils) pingIP(ipStr string) bool {
|
||||
conn, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
conn.SetDeadline(time.Now().Add(500 * time.Millisecond))
|
||||
|
||||
ip := net.ParseIP(ipStr)
|
||||
seq := uint16(time.Now().UnixNano() & 0xFFFF)
|
||||
|
||||
msg := icmp.Message{
|
||||
Type: ipv4.ICMPTypeEcho,
|
||||
Code: 0,
|
||||
Body: &icmp.Echo{
|
||||
ID: int(seq),
|
||||
Seq: 1,
|
||||
Data: []byte("tendrils"),
|
||||
},
|
||||
}
|
||||
msgBytes, err := msg.Marshal(nil)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
_, err = conn.WriteTo(msgBytes, &net.IPAddr{IP: ip})
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
buf := make([]byte, 1500)
|
||||
for {
|
||||
n, peer, err := conn.ReadFrom(buf)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
parsed, err := icmp.ParseMessage(1, buf[:n])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if parsed.Type == ipv4.ICMPTypeEchoReply {
|
||||
if ipAddr, ok := peer.(*net.IPAddr); ok {
|
||||
if ipAddr.IP.String() == ipStr {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -725,20 +725,32 @@
|
||||
nodeEl.addEventListener('click', () => scrollToNode(err.node_typeid));
|
||||
item.appendChild(nodeEl);
|
||||
|
||||
const portEl = document.createElement('div');
|
||||
portEl.className = 'error-port';
|
||||
portEl.textContent = 'Port: ' + err.port_name;
|
||||
item.appendChild(portEl);
|
||||
if (err.error_type === 'unreachable') {
|
||||
const ipEl = document.createElement('div');
|
||||
ipEl.className = 'error-port';
|
||||
ipEl.textContent = 'IP: ' + err.port_name;
|
||||
item.appendChild(ipEl);
|
||||
|
||||
const countsEl = document.createElement('div');
|
||||
countsEl.className = 'error-counts';
|
||||
countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
|
||||
item.appendChild(countsEl);
|
||||
const typeEl = document.createElement('div');
|
||||
typeEl.className = 'error-type';
|
||||
typeEl.textContent = 'Unreachable';
|
||||
item.appendChild(typeEl);
|
||||
} else {
|
||||
const portEl = document.createElement('div');
|
||||
portEl.className = 'error-port';
|
||||
portEl.textContent = 'Port: ' + err.port_name;
|
||||
item.appendChild(portEl);
|
||||
|
||||
const typeEl = document.createElement('div');
|
||||
typeEl.className = 'error-type';
|
||||
typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
|
||||
item.appendChild(typeEl);
|
||||
const countsEl = document.createElement('div');
|
||||
countsEl.className = 'error-counts';
|
||||
countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
|
||||
item.appendChild(countsEl);
|
||||
|
||||
const typeEl = document.createElement('div');
|
||||
typeEl.className = 'error-type';
|
||||
typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
|
||||
item.appendChild(typeEl);
|
||||
}
|
||||
|
||||
const dismissBtn = document.createElement('button');
|
||||
dismissBtn.textContent = 'Dismiss';
|
||||
|
||||
Reference in New Issue
Block a user