From c8be46b73958ca73c15d32a6bee18a33b1196633 Mon Sep 17 00:00:00 2001
From: Ian Gulliver
Date: Sun, 25 Jan 2026 18:56:12 -0800
Subject: [PATCH] Add per-node ping monitoring with unreachable error tracking

Co-Authored-By: Claude Opus 4.5
---
Review notes (this section is ignored by git am):
* ping.go: pingIP now normalises the address with To4, checks the
  SetDeadline error, and matches replies with net.IP.Equal instead of a
  string comparison. Doc comments were added in errors.go and ping.go.
* Whitespace was reconstructed: tabs per gofmt in the Go files; the
  indentation in static/index.html is a best guess, so that hunk may need
  "git apply --ignore-whitespace".
* The post-image blob hashes on the errors.go and ping.go "index" lines
  were not regenerated.

 errors.go         |  63 +++++++++++++++++++++++++++-
 nodes.go          |  10 +++--
 ping.go           | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 static/index.html |  36 ++++++++++------
 4 files changed, 196 insertions(+), 17 deletions(-)
 create mode 100644 ping.go

diff --git a/errors.go b/errors.go
index 6f609fd..d265642 100644
--- a/errors.go
+++ b/errors.go
@@ -9,8 +9,9 @@ import (
 type PortErrorType string
 
 const (
-	ErrorTypeStartup PortErrorType = "startup"
-	ErrorTypeNew     PortErrorType = "new"
+	ErrorTypeStartup     PortErrorType = "startup"
+	ErrorTypeNew         PortErrorType = "new"
+	ErrorTypeUnreachable PortErrorType = "unreachable"
 )
 
 type PortError struct {
@@ -181,3 +182,61 @@ func (e *ErrorTracker) GetErrors() []*PortError {
 	}
 	return errors
 }
+
+// SetUnreachable records an unreachable error for ip on node. It calls
+// NotifyUpdate only when the error was not already being tracked.
+func (e *ErrorTracker) SetUnreachable(node *Node, ip string) {
+	changed := e.setUnreachableLocked(node, ip)
+	if changed {
+		e.t.NotifyUpdate()
+	}
+}
+
+// setUnreachableLocked acquires e.mu itself and inserts an unreachable error
+// for ip on node. It reports whether a new entry was created; an existing
+// entry is left untouched.
+func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) bool {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	key := "unreachable:" + node.TypeID + ":" + ip
+	if _, exists := e.errors[key]; exists {
+		return false
+	}
+
+	now := time.Now()
+	e.nextID++
+	e.errors[key] = &PortError{
+		ID:          fmt.Sprintf("err-%d", e.nextID),
+		NodeTypeID:  node.TypeID,
+		NodeName:    node.DisplayName(),
+		PortName:    ip,
+		ErrorType:   ErrorTypeUnreachable,
+		FirstSeen:   now,
+		LastUpdated: now,
+	}
+	return true
+}
+
+// ClearUnreachable removes the unreachable error for ip on node, if any. It
+// calls NotifyUpdate only when an entry was actually removed.
+func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) {
+	changed := e.clearUnreachableLocked(node, ip)
+	if changed {
+		e.t.NotifyUpdate()
+	}
+}
+
+// clearUnreachableLocked acquires e.mu itself and deletes the unreachable
+// error for ip on node. It reports whether an entry existed.
+func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) bool {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	key := "unreachable:" + node.TypeID + ":" + ip
+	if _, exists := e.errors[key]; exists {
+		delete(e.errors, key)
+		return true
+	}
+	return false
+}
diff --git a/nodes.go b/nodes.go
index 765192b..c7d38c4 100644
--- a/nodes.go
+++ b/nodes.go
@@ -252,8 +252,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
 	n.nodeCancel[nodeID] = cancel
 
 	go func() {
-		ticker := time.NewTicker(10 * time.Second)
-		defer ticker.Stop()
+		pollTicker := time.NewTicker(10 * time.Second)
+		pingTicker := time.NewTicker(1 * time.Second)
+		defer pollTicker.Stop()
+		defer pingTicker.Stop()
 
 		for {
 			select {
@@ -261,8 +263,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
 				return
 			case <-node.pollTrigger:
 				n.t.pollNode(node)
-			case <-ticker.C:
+			case <-pollTicker.C:
 				n.t.pollNode(node)
+			case <-pingTicker.C:
+				n.t.pingNode(node)
 			}
 		}
 	}()
diff --git a/ping.go b/ping.go
new file mode 100644
index 0000000..2d0944c
--- /dev/null
+++ b/ping.go
@@ -0,0 +1,104 @@
+package tendrils
+
+import (
+	"net"
+	"time"
+
+	"golang.org/x/net/icmp"
+	"golang.org/x/net/ipv4"
+)
+
+// pingNode sends one ICMP echo to every IPv4 address on node's interfaces and
+// sets or clears the matching unreachable error based on the result.
+func (t *Tendrils) pingNode(node *Node) {
+	// Copy the addresses under the read lock so the pings below, which can
+	// each wait for the reply timeout, run without holding it.
+	t.nodes.mu.RLock()
+	var ips []string
+	for _, iface := range node.Interfaces {
+		for ipStr := range iface.IPs {
+			ip := net.ParseIP(ipStr)
+			if ip != nil && ip.To4() != nil {
+				ips = append(ips, ipStr)
+			}
+		}
+	}
+	t.nodes.mu.RUnlock()
+
+	if len(ips) == 0 {
+		return
+	}
+
+	for _, ipStr := range ips {
+		reachable := t.pingIP(ipStr)
+		if reachable {
+			t.errors.ClearUnreachable(node, ipStr)
+		} else {
+			t.errors.SetUnreachable(node, ipStr)
+		}
+	}
+}
+
+// pingIP sends a single ICMPv4 echo request to ipStr and reports whether an
+// echo reply from that address arrives within 500ms. It returns false if
+// ipStr is not an IPv4 address or if the socket cannot be opened, given a
+// deadline, or written to.
+func (t *Tendrils) pingIP(ipStr string) bool {
+	// Normalise to the 4-byte form so IPv4-mapped input such as
+	// "::ffff:192.0.2.1" can still be matched against the reply's source.
+	ip := net.ParseIP(ipStr).To4()
+	if ip == nil {
+		return false
+	}
+
+	conn, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0")
+	if err != nil {
+		return false
+	}
+	defer conn.Close()
+
+	// Without a deadline the read loop below could block indefinitely.
+	if err := conn.SetDeadline(time.Now().Add(500 * time.Millisecond)); err != nil {
+		return false
+	}
+
+	id := int(time.Now().UnixNano() & 0xFFFF)
+
+	msg := icmp.Message{
+		Type: ipv4.ICMPTypeEcho,
+		Code: 0,
+		Body: &icmp.Echo{
+			ID:   id,
+			Seq:  1,
+			Data: []byte("tendrils"),
+		},
+	}
+	msgBytes, err := msg.Marshal(nil)
+	if err != nil {
+		return false
+	}
+
+	if _, err := conn.WriteTo(msgBytes, &net.IPAddr{IP: ip}); err != nil {
+		return false
+	}
+
+	buf := make([]byte, 1500)
+	for {
+		n, peer, err := conn.ReadFrom(buf)
+		if err != nil {
+			// Includes the deadline expiring before a matching reply.
+			return false
+		}
+
+		// 1 is the IANA protocol number for ICMPv4.
+		parsed, err := icmp.ParseMessage(1, buf[:n])
+		if err != nil || parsed.Type != ipv4.ICMPTypeEchoReply {
+			continue
+		}
+
+		// Compare addresses rather than their string forms.
+		if ipAddr, ok := peer.(*net.IPAddr); ok && ipAddr.IP.Equal(ip) {
+			return true
+		}
+	}
+}
diff --git a/static/index.html b/static/index.html
index b090f70..9190431 100644
--- a/static/index.html
+++ b/static/index.html
@@ -725,20 +725,32 @@
                 nodeEl.addEventListener('click', () => scrollToNode(err.node_typeid));
                 item.appendChild(nodeEl);
 
-                const portEl = document.createElement('div');
-                portEl.className = 'error-port';
-                portEl.textContent = 'Port: ' + err.port_name;
-                item.appendChild(portEl);
+                if (err.error_type === 'unreachable') {
+                    const ipEl = document.createElement('div');
+                    ipEl.className = 'error-port';
+                    ipEl.textContent = 'IP: ' + err.port_name;
+                    item.appendChild(ipEl);
 
-                const countsEl = document.createElement('div');
-                countsEl.className = 'error-counts';
-                countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
-                item.appendChild(countsEl);
+                    const typeEl = document.createElement('div');
+                    typeEl.className = 'error-type';
+                    typeEl.textContent = 'Unreachable';
+                    item.appendChild(typeEl);
+                } else {
+                    const portEl = document.createElement('div');
+                    portEl.className = 'error-port';
+                    portEl.textContent = 'Port: ' + err.port_name;
+                    item.appendChild(portEl);
 
-                const typeEl = document.createElement('div');
-                typeEl.className = 'error-type';
-                typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
-                item.appendChild(typeEl);
+                    const countsEl = document.createElement('div');
+                    countsEl.className = 'error-counts';
+                    countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
+                    item.appendChild(countsEl);
+
+                    const typeEl = document.createElement('div');
+                    typeEl.className = 'error-type';
+                    typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
+                    item.appendChild(typeEl);
+                }
 
                 const dismissBtn = document.createElement('button');
                 dismissBtn.textContent = 'Dismiss';