Files
EdgeAPI/internal/tasks/node_monitor_task.go

122 lines
2.9 KiB
Go
Raw Normal View History

2020-10-25 18:26:46 +08:00
package tasks
import (
"github.com/TeaOSLab/EdgeAPI/internal/db/models"
"github.com/TeaOSLab/EdgeAPI/internal/goman"
2020-10-25 18:26:46 +08:00
"github.com/TeaOSLab/EdgeAPI/internal/utils/numberutils"
2021-08-08 10:29:48 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/nodeconfigs"
2021-01-19 12:05:35 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/systemconfigs"
2020-10-25 18:26:46 +08:00
"github.com/iwind/TeaGo/dbs"
"github.com/iwind/TeaGo/logs"
"time"
)
func init() {
dbs.OnReadyDone(func() {
var task = NewNodeMonitorTask(60)
var ticker = time.NewTicker(60 * time.Second)
goman.New(func() {
2020-10-25 18:26:46 +08:00
for range ticker.C {
err := task.loop()
if err != nil {
logs.Println("[TASK][NODE_MONITOR]" + err.Error())
}
}
})
2020-10-25 18:26:46 +08:00
})
}
2021-08-08 10:29:48 +08:00
// NodeMonitorTask 边缘节点监控任务
2020-10-25 18:26:46 +08:00
type NodeMonitorTask struct {
intervalSeconds int
inactiveMap map[int64]int // nodeId => count
2020-10-25 18:26:46 +08:00
}
func NewNodeMonitorTask(intervalSeconds int) *NodeMonitorTask {
return &NodeMonitorTask{
intervalSeconds: intervalSeconds,
inactiveMap: map[int64]int{},
2020-10-25 18:26:46 +08:00
}
}
func (this *NodeMonitorTask) Run() {
}
func (this *NodeMonitorTask) loop() error {
// 检查上次运行时间,防止重复运行
2021-01-19 12:05:35 +08:00
settingKey := systemconfigs.SettingCodeNodeMonitor + "Loop"
2020-10-25 18:26:46 +08:00
timestamp := time.Now().Unix()
c, err := models.SharedSysSettingDAO.CompareInt64Setting(nil, settingKey, timestamp-int64(this.intervalSeconds))
2020-10-25 18:26:46 +08:00
if err != nil {
return err
}
if c > 0 {
return nil
}
// 记录时间
err = models.SharedSysSettingDAO.UpdateSetting(nil, settingKey, []byte(numberutils.FormatInt64(timestamp)))
2020-10-25 18:26:46 +08:00
if err != nil {
return err
}
clusters, err := models.SharedNodeClusterDAO.FindAllEnableClusters(nil)
2020-10-25 18:26:46 +08:00
if err != nil {
return err
}
for _, cluster := range clusters {
err := this.monitorCluster(cluster)
if err != nil {
return err
}
}
return nil
}
func (this *NodeMonitorTask) monitorCluster(cluster *models.NodeCluster) error {
clusterId := int64(cluster.Id)
// 检查离线节点
inactiveNodes, err := models.SharedNodeDAO.FindAllInactiveNodesWithClusterId(nil, clusterId)
2020-10-25 18:26:46 +08:00
if err != nil {
return err
}
var nodeMap = map[int64]*models.Node{}
2020-10-25 18:26:46 +08:00
for _, node := range inactiveNodes {
var nodeId = int64(node.Id)
nodeMap[nodeId] = node
this.inactiveMap[nodeId]++
}
2020-11-16 09:20:24 +08:00
const maxInactiveTries = 5
// 处理现有的离线状态
for nodeId, count := range this.inactiveMap {
node, ok := nodeMap[nodeId]
if ok {
// 连续两次
if count >= maxInactiveTries {
this.inactiveMap[nodeId] = 0
subject := "节点\"" + node.Name + "\"已处于离线状态"
msg := "集群'" + cluster.Name + "'节点\"" + node.Name + "\"已处于离线状态,请检查节点是否异常"
err = models.SharedMessageDAO.CreateNodeMessage(nil, nodeconfigs.NodeRoleNode, clusterId, int64(node.Id), models.MessageTypeNodeInactive, models.LevelError, subject, msg, nil, false)
if err != nil {
return err
}
}
} else {
this.inactiveMap[nodeId] = 0
2020-11-16 09:20:24 +08:00
}
2020-10-25 18:26:46 +08:00
}
// 检查CPU、内存、磁盘不足节点
2020-10-25 18:26:46 +08:00
// TODO 需要实现
return nil
}