diff --git a/internal/tasks/node_monitor_task.go b/internal/tasks/node_monitor_task.go index e35070d1..2a88ec16 100644 --- a/internal/tasks/node_monitor_task.go +++ b/internal/tasks/node_monitor_task.go @@ -8,6 +8,8 @@ import ( "github.com/TeaOSLab/EdgeCommon/pkg/systemconfigs" "github.com/iwind/TeaGo/dbs" "github.com/iwind/TeaGo/logs" + "github.com/iwind/TeaGo/types" + "strings" "time" ) @@ -30,13 +32,15 @@ func init() { type NodeMonitorTask struct { intervalSeconds int - inactiveMap map[int64]int // nodeId => count + inactiveMap map[string]int // cluster@nodeId => count + notifiedMap map[int64]int64 // nodeId => timestamp } func NewNodeMonitorTask(intervalSeconds int) *NodeMonitorTask { return &NodeMonitorTask{ intervalSeconds: intervalSeconds, - inactiveMap: map[int64]int{}, + inactiveMap: map[string]int{}, + notifiedMap: map[int64]int64{}, } } @@ -89,18 +93,25 @@ func (this *NodeMonitorTask) monitorCluster(cluster *models.NodeCluster) error { for _, node := range inactiveNodes { var nodeId = int64(node.Id) nodeMap[nodeId] = node - this.inactiveMap[nodeId]++ + this.inactiveMap[types.String(clusterId)+"@"+types.String(nodeId)]++ } const maxInactiveTries = 5 // 处理现有的离线状态 - for nodeId, count := range this.inactiveMap { + for key, count := range this.inactiveMap { + var pieces = strings.Split(key, "@") + if pieces[0] != types.String(clusterId) { + continue + } + var nodeId = types.Int64(pieces[1]) node, ok := nodeMap[nodeId] if ok { - // 连续两次 - if count >= maxInactiveTries { - this.inactiveMap[nodeId] = 0 + // 连续 N 次离线发送通知 + // 同时也要确保两次发送通知的时间不会过近 + if count >= maxInactiveTries && time.Now().Unix()-this.notifiedMap[nodeId] > 3600 { + this.inactiveMap[key] = 0 + this.notifiedMap[nodeId] = time.Now().Unix() subject := "节点\"" + node.Name + "\"已处于离线状态" msg := "集群'" + cluster.Name + "'节点\"" + node.Name + "\"已处于离线状态,请检查节点是否异常" @@ -110,7 +121,7 @@ func (this *NodeMonitorTask) monitorCluster(cluster *models.NodeCluster) error { } } } else { - this.inactiveMap[nodeId] = 0 + delete(this.inactiveMap, key) } }