Add per-node ping monitoring with unreachable error tracking

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Ian Gulliver
2026-01-25 18:56:12 -08:00
parent a94f816f3d
commit c8be46b739
4 changed files with 172 additions and 17 deletions

View File

@@ -9,8 +9,9 @@ import (
type PortErrorType string
const (
ErrorTypeStartup PortErrorType = "startup"
ErrorTypeNew PortErrorType = "new"
ErrorTypeStartup PortErrorType = "startup"
ErrorTypeNew PortErrorType = "new"
ErrorTypeUnreachable PortErrorType = "unreachable"
)
type PortError struct {
@@ -181,3 +182,52 @@ func (e *ErrorTracker) GetErrors() []*PortError {
}
return errors
}
// SetUnreachable records an "unreachable" error for the given node and IP.
// If a new error entry was created, it notifies listeners via
// e.t.NotifyUpdate; if the entry already existed, nothing else happens.
func (e *ErrorTracker) SetUnreachable(node *Node, ip string) {
	// The notification is issued only after the helper has returned, i.e.
	// after the tracker mutex has been released.
	if added := e.setUnreachableLocked(node, ip); added {
		e.t.NotifyUpdate()
	}
}
// setUnreachableLocked inserts an unreachable-error entry for node/ip if one
// is not already present. It returns true when a new entry was added and
// false when the entry already existed.
//
// Note: despite the "Locked" suffix, this function acquires e.mu itself and
// holds it for the duration of the call; callers must not already hold it.
func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) bool {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Entries are keyed by node type ID plus IP, under an "unreachable:" prefix.
	errKey := "unreachable:" + node.TypeID + ":" + ip
	if _, known := e.errors[errKey]; known {
		return false
	}

	e.nextID++
	stamp := time.Now()
	entry := &PortError{
		ID:          fmt.Sprintf("err-%d", e.nextID),
		NodeTypeID:  node.TypeID,
		NodeName:    node.DisplayName(),
		PortName:    ip, // the IP is stored in the port-name field for this error type
		ErrorType:   ErrorTypeUnreachable,
		FirstSeen:   stamp,
		LastUpdated: stamp,
	}
	e.errors[errKey] = entry
	return true
}
// ClearUnreachable removes the "unreachable" error for the given node and IP,
// if one is recorded. When an entry was actually removed, it notifies
// listeners via e.t.NotifyUpdate.
func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) {
	// The notification is issued only after the helper has returned, i.e.
	// after the tracker mutex has been released.
	if removed := e.clearUnreachableLocked(node, ip); removed {
		e.t.NotifyUpdate()
	}
}
// clearUnreachableLocked deletes the unreachable-error entry for node/ip.
// It returns true when an entry existed and was deleted, false otherwise.
//
// Note: despite the "Locked" suffix, this function acquires e.mu itself and
// holds it for the duration of the call; callers must not already hold it.
func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) bool {
	e.mu.Lock()
	defer e.mu.Unlock()

	errKey := "unreachable:" + node.TypeID + ":" + ip
	if _, present := e.errors[errKey]; !present {
		return false
	}
	delete(e.errors, errKey)
	return true
}

View File

@@ -252,8 +252,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
n.nodeCancel[nodeID] = cancel
go func() {
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
pollTicker := time.NewTicker(10 * time.Second)
pingTicker := time.NewTicker(1 * time.Second)
defer pollTicker.Stop()
defer pingTicker.Stop()
for {
select {
@@ -261,8 +263,10 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) {
return
case <-node.pollTrigger:
n.t.pollNode(node)
case <-ticker.C:
case <-pollTicker.C:
n.t.pollNode(node)
case <-pingTicker.C:
n.t.pingNode(node)
}
}
}()

89
ping.go Normal file
View File

@@ -0,0 +1,89 @@
package tendrils
import (
"net"
"time"
"golang.org/x/net/icmp"
"golang.org/x/net/ipv4"
)
// pingNode pings every IPv4 address found on the node's interfaces and
// updates the error tracker accordingly: an address that answers has its
// unreachable error cleared, one that does not has an unreachable error set.
func (t *Tendrils) pingNode(node *Node) {
	var targets []string

	// Snapshot the addresses under the nodes read lock so the pings below
	// run without holding it.
	t.nodes.mu.RLock()
	for _, iface := range node.Interfaces {
		for addr := range iface.IPs {
			parsed := net.ParseIP(addr)
			if parsed == nil || parsed.To4() == nil {
				// Skip unparsable and non-IPv4 addresses.
				continue
			}
			targets = append(targets, addr)
		}
	}
	t.nodes.mu.RUnlock()

	// Addresses are probed one at a time; with no targets this loop is a no-op.
	for _, addr := range targets {
		if t.pingIP(addr) {
			t.errors.ClearUnreachable(node, addr)
			continue
		}
		t.errors.SetUnreachable(node, addr)
	}
}
// pingIP sends a single ICMPv4 echo request to ipStr and reports whether a
// matching echo reply is received before a 500ms deadline.
//
// A fresh "ip4:icmp" endpoint is opened for every call and closed on return.
// Every failure mode (invalid or non-IPv4 address, socket creation, deadline
// setup, marshal, send, read error or timeout) is reported as false.
func (t *Tendrils) pingIP(ipStr string) bool {
	// IANA protocol number for ICMPv4, as expected by icmp.ParseMessage.
	const protocolICMP = 1
	// Sequence number carried in the single request we send.
	const echoSeq = 1

	// Validate the target up front; To4 on a nil IP yields nil, so this
	// rejects both unparsable strings and non-IPv4 addresses.
	ip := net.ParseIP(ipStr).To4()
	if ip == nil {
		return false
	}

	conn, err := icmp.ListenPacket("ip4:icmp", "0.0.0.0")
	if err != nil {
		return false
	}
	defer conn.Close()

	// Without a deadline the read loop below could block forever, so a
	// failure to set it is treated as a failed ping.
	if err := conn.SetDeadline(time.Now().Add(500 * time.Millisecond)); err != nil {
		return false
	}

	// Use the low 16 bits of the clock as the echo identifier so replies can
	// be matched to this particular request.
	echoID := int(uint16(time.Now().UnixNano() & 0xFFFF))
	msg := icmp.Message{
		Type: ipv4.ICMPTypeEcho,
		Code: 0,
		Body: &icmp.Echo{
			ID:   echoID,
			Seq:  echoSeq,
			Data: []byte("tendrils"),
		},
	}
	msgBytes, err := msg.Marshal(nil)
	if err != nil {
		return false
	}
	if _, err := conn.WriteTo(msgBytes, &net.IPAddr{IP: ip}); err != nil {
		return false
	}

	buf := make([]byte, 1500)
	for {
		// Keep reading until our reply shows up or the deadline makes
		// ReadFrom return an error.
		n, peer, err := conn.ReadFrom(buf)
		if err != nil {
			return false
		}
		parsed, err := icmp.ParseMessage(protocolICMP, buf[:n])
		if err != nil {
			continue
		}
		if parsed.Type != ipv4.ICMPTypeEchoReply {
			continue
		}
		// Only accept the reply to our own request; the socket may also see
		// echo replies that belong to other pings.
		echo, ok := parsed.Body.(*icmp.Echo)
		if !ok || echo.ID != echoID || echo.Seq != echoSeq {
			continue
		}
		// Compare addresses semantically rather than by string so that
		// non-canonical spellings of the target still match.
		if addr, ok := peer.(*net.IPAddr); ok && addr.IP.Equal(ip) {
			return true
		}
	}
}

View File

@@ -725,20 +725,32 @@
nodeEl.addEventListener('click', () => scrollToNode(err.node_typeid));
item.appendChild(nodeEl);
const portEl = document.createElement('div');
portEl.className = 'error-port';
portEl.textContent = 'Port: ' + err.port_name;
item.appendChild(portEl);
if (err.error_type === 'unreachable') {
const ipEl = document.createElement('div');
ipEl.className = 'error-port';
ipEl.textContent = 'IP: ' + err.port_name;
item.appendChild(ipEl);
const countsEl = document.createElement('div');
countsEl.className = 'error-counts';
countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
item.appendChild(countsEl);
const typeEl = document.createElement('div');
typeEl.className = 'error-type';
typeEl.textContent = 'Unreachable';
item.appendChild(typeEl);
} else {
const portEl = document.createElement('div');
portEl.className = 'error-port';
portEl.textContent = 'Port: ' + err.port_name;
item.appendChild(portEl);
const typeEl = document.createElement('div');
typeEl.className = 'error-type';
typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
item.appendChild(typeEl);
const countsEl = document.createElement('div');
countsEl.className = 'error-counts';
countsEl.textContent = 'In: ' + err.in_errors + ' (+' + (err.in_delta || 0) + ') / Out: ' + err.out_errors + ' (+' + (err.out_delta || 0) + ')';
item.appendChild(countsEl);
const typeEl = document.createElement('div');
typeEl.className = 'error-type';
typeEl.textContent = err.error_type === 'startup' ? 'Present at startup' : 'New errors detected';
item.appendChild(typeEl);
}
const dismissBtn = document.createElement('button');
dismissBtn.textContent = 'Dismiss';