2020-10-17 21:15:31 +08:00
package tasks
import (
2021-02-24 16:20:22 +08:00
"context"
2020-10-17 21:15:31 +08:00
"crypto/tls"
"encoding/json"
2021-11-18 14:30:53 +08:00
teaconst "github.com/TeaOSLab/EdgeAPI/internal/const"
2020-10-17 21:15:31 +08:00
"github.com/TeaOSLab/EdgeAPI/internal/db/models"
"github.com/TeaOSLab/EdgeAPI/internal/errors"
2021-09-13 13:46:20 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/configutils"
2024-04-06 10:21:52 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/iputils"
2021-05-26 14:40:05 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/nodeconfigs"
2021-10-19 16:31:05 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/nodeutils"
2020-10-17 21:15:31 +08:00
"github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs"
2023-04-26 10:50:29 +08:00
"github.com/iwind/TeaGo/dbs"
2020-10-17 21:15:31 +08:00
"github.com/iwind/TeaGo/lists"
2021-10-19 16:31:05 +08:00
"github.com/iwind/TeaGo/maps"
2020-10-17 21:15:31 +08:00
"github.com/iwind/TeaGo/types"
2023-05-17 18:42:21 +08:00
timeutil "github.com/iwind/TeaGo/utils/time"
2021-02-24 16:20:22 +08:00
"net"
2020-10-17 21:15:31 +08:00
"net/http"
"strconv"
"strings"
"sync"
"time"
)
type HealthCheckExecutor struct {
2022-04-23 12:32:30 +08:00
BaseTask
2020-10-17 21:15:31 +08:00
clusterId int64
}
// NewHealthCheckExecutor creates an executor bound to the cluster identified
// by clusterId.
func NewHealthCheckExecutor(clusterId int64) *HealthCheckExecutor {
	var executor = new(HealthCheckExecutor)
	executor.clusterId = clusterId
	return executor
}
func ( this * HealthCheckExecutor ) Run ( ) ( [ ] * HealthCheckResult , error ) {
2021-01-01 23:31:30 +08:00
cluster , err := models . NewNodeClusterDAO ( ) . FindEnabledNodeCluster ( nil , this . clusterId )
2020-10-17 21:15:31 +08:00
if err != nil {
return nil , err
}
2022-11-16 14:10:03 +08:00
if cluster == nil || ! cluster . IsOn {
// 如果节点已经被删除,则不提示错误
return nil , nil
2020-10-17 21:15:31 +08:00
}
2022-03-21 21:39:36 +08:00
if ! cluster . HealthCheck . IsNotNull ( ) {
2020-10-17 21:15:31 +08:00
return nil , errors . New ( "health check config is not found" )
}
2022-04-22 21:53:38 +08:00
var healthCheckConfig = & serverconfigs . HealthCheckConfig { }
2022-03-21 21:39:36 +08:00
err = json . Unmarshal ( cluster . HealthCheck , healthCheckConfig )
2020-10-17 21:15:31 +08:00
if err != nil {
return nil , err
}
2022-04-22 21:53:38 +08:00
var results = [ ] * HealthCheckResult { }
2022-11-10 12:44:12 +08:00
// 查询集群下的节点
2022-05-25 11:44:18 +08:00
nodes , err := models . NewNodeDAO ( ) . FindAllEnabledNodesWithClusterId ( nil , this . clusterId , false )
2020-10-17 21:15:31 +08:00
if err != nil {
return nil , err
}
2022-11-10 12:44:12 +08:00
if len ( nodes ) == 0 {
return results , nil
}
2023-04-26 10:50:29 +08:00
var tx * dbs . Tx
2020-10-17 21:15:31 +08:00
for _ , node := range nodes {
2023-05-17 18:42:21 +08:00
if ! node . IsOn || node . IsBackupForCluster || node . IsBackupForGroup || ( len ( node . OfflineDay ) > 0 && node . OfflineDay < timeutil . Format ( "Ymd" ) ) {
2020-10-17 21:15:31 +08:00
continue
}
2023-04-26 10:50:29 +08:00
ipAddrs , err := models . SharedNodeIPAddressDAO . FindNodeAccessIPAddresses ( tx , int64 ( node . Id ) , nodeconfigs . NodeRoleNode )
2020-10-17 21:15:31 +08:00
if err != nil {
return nil , err
}
2023-04-26 10:50:29 +08:00
for _ , ipAddr := range ipAddrs {
var ipClusterIds = ipAddr . DecodeClusterIds ( )
if len ( ipClusterIds ) > 0 && ! lists . ContainsInt64 ( ipClusterIds , this . clusterId ) {
continue
}
2020-10-17 21:15:31 +08:00
2023-04-26 10:50:29 +08:00
// TODO 支持备用IP
var result = & HealthCheckResult {
Node : node ,
NodeAddrId : int64 ( ipAddr . Id ) ,
NodeAddr : ipAddr . Ip ,
}
results = append ( results , result )
}
2020-10-17 21:15:31 +08:00
}
// 并行检查
2022-04-22 21:53:38 +08:00
var preparedResults = [ ] * HealthCheckResult { }
2020-10-17 21:15:31 +08:00
for _ , result := range results {
if len ( result . NodeAddr ) > 0 {
preparedResults = append ( preparedResults , result )
}
}
if len ( preparedResults ) == 0 {
return results , nil
}
2022-04-22 21:53:38 +08:00
var countResults = len ( preparedResults )
var queue = make ( chan * HealthCheckResult , countResults )
2020-10-17 21:15:31 +08:00
for _ , result := range preparedResults {
queue <- result
}
2022-04-22 21:53:38 +08:00
var countTries = types . Int ( healthCheckConfig . CountTries )
2020-10-17 21:15:31 +08:00
if countTries > 10 { // 限定最多尝试10次 TODO 应该在管理界面提示用户
countTries = 10
}
if countTries < 1 {
2021-08-29 16:01:31 +08:00
countTries = 3
2020-10-17 21:15:31 +08:00
}
2022-04-22 21:53:38 +08:00
var tryDelay = 1 * time . Second
2020-10-17 21:15:31 +08:00
if healthCheckConfig . TryDelay != nil {
tryDelay = healthCheckConfig . TryDelay . Duration ( )
if tryDelay > 1 * time . Minute { // 最多不能超过1分钟 TODO 应该在管理界面提示用户
tryDelay = 1 * time . Minute
}
}
2022-04-22 21:53:38 +08:00
var concurrent = 128
2022-11-10 12:44:12 +08:00
var wg = sync . WaitGroup { }
2020-10-17 21:15:31 +08:00
wg . Add ( countResults )
2022-04-22 21:53:38 +08:00
for i := 0 ; i < concurrent ; i ++ {
go func ( ) {
2020-10-17 21:15:31 +08:00
for {
select {
case result := <- queue :
2022-04-22 21:53:38 +08:00
this . runNode ( healthCheckConfig , result , countTries , tryDelay )
2020-10-17 21:15:31 +08:00
wg . Done ( )
default :
return
}
}
2022-04-22 21:53:38 +08:00
} ( )
2020-10-17 21:15:31 +08:00
}
wg . Wait ( )
return results , nil
}
2022-04-22 21:53:38 +08:00
func ( this * HealthCheckExecutor ) runNode ( healthCheckConfig * serverconfigs . HealthCheckConfig , result * HealthCheckResult , countTries int , tryDelay time . Duration ) {
for i := 1 ; i <= countTries ; i ++ {
var before = time . Now ( )
err := this . runNodeOnce ( healthCheckConfig , result )
result . CostMs = time . Since ( before ) . Seconds ( ) * 1000
if err != nil {
result . Error = err . Error ( )
}
if result . IsOk {
break
}
if tryDelay > 0 {
time . Sleep ( tryDelay )
}
}
// 修改节点IP状态
if teaconst . IsPlus {
isChanged , err := models . SharedNodeIPAddressDAO . UpdateAddressHealthCount ( nil , result . NodeAddrId , result . IsOk , healthCheckConfig . CountUp , healthCheckConfig . CountDown , healthCheckConfig . AutoDown )
if err != nil {
2022-04-23 12:32:30 +08:00
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
2022-04-22 21:53:38 +08:00
return
}
if isChanged {
2022-11-25 15:48:57 +08:00
// 在线状态发生变化
if healthCheckConfig . AutoDown {
// 发送消息
2023-08-08 16:46:17 +08:00
var message string
2022-11-25 15:48:57 +08:00
var messageType string
var messageLevel string
if result . IsOk {
message = "健康检查成功,节点\"" + result . Node . Name + "\", IP\"" + result . NodeAddr + "\"已恢复上线"
messageType = models . MessageTypeHealthCheckNodeUp
messageLevel = models . MessageLevelSuccess
} else {
message = "健康检查失败,节点\"" + result . Node . Name + "\", IP\"" + result . NodeAddr + "\"已自动下线"
messageType = models . MessageTypeHealthCheckNodeDown
messageLevel = models . MessageLevelError
}
2022-04-22 21:53:38 +08:00
2022-11-25 15:48:57 +08:00
err = models . NewMessageDAO ( ) . CreateNodeMessage ( nil , nodeconfigs . NodeRoleNode , this . clusterId , int64 ( result . Node . Id ) , messageType , messageLevel , message , message , nil , false )
if err != nil {
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
return
}
2023-05-17 18:42:21 +08:00
// 触发节点动作
if ! result . IsOk {
err := this . fireNodeActions ( int64 ( result . Node . Id ) )
if err != nil {
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
}
return
}
2022-04-22 21:53:38 +08:00
}
// 触发阈值
err = models . SharedNodeIPAddressDAO . FireThresholds ( nil , nodeconfigs . NodeRoleNode , int64 ( result . Node . Id ) )
if err != nil {
2022-04-23 12:32:30 +08:00
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
2022-04-22 21:53:38 +08:00
return
}
}
2022-11-26 15:54:16 +08:00
// 结束处理 , 因为我们只处理IP的上下线, 不修改节点的状态
if healthCheckConfig . AutoDown {
return
}
2022-04-22 21:53:38 +08:00
}
// 修改节点状态
if healthCheckConfig . AutoDown {
isChanged , err := models . SharedNodeDAO . UpdateNodeUpCount ( nil , int64 ( result . Node . Id ) , result . IsOk , healthCheckConfig . CountUp , healthCheckConfig . CountDown )
if err != nil {
2022-04-23 12:32:30 +08:00
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
2022-04-22 21:53:38 +08:00
} else if isChanged {
// 通知恢复或下线
if result . IsOk {
2023-03-01 15:51:09 +08:00
var message = "健康检查成功,节点\"" + result . Node . Name + "\"已恢复上线"
2022-04-22 21:53:38 +08:00
err = models . NewMessageDAO ( ) . CreateNodeMessage ( nil , nodeconfigs . NodeRoleNode , this . clusterId , int64 ( result . Node . Id ) , models . MessageTypeHealthCheckNodeUp , models . MessageLevelSuccess , message , message , nil , false )
} else {
2023-03-01 15:51:09 +08:00
var message = "健康检查失败,节点\"" + result . Node . Name + "\"已自动下线"
2022-04-22 21:53:38 +08:00
err = models . NewMessageDAO ( ) . CreateNodeMessage ( nil , nodeconfigs . NodeRoleNode , this . clusterId , int64 ( result . Node . Id ) , models . MessageTypeHealthCheckNodeDown , models . MessageLevelError , message , message , nil , false )
}
2022-11-26 15:54:16 +08:00
if err != nil {
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
return
}
}
} else {
// 通知健康检查结果
var err error
if result . IsOk {
message := "节点\"" + result . Node . Name + "\"健康检查成功"
err = models . NewMessageDAO ( ) . CreateNodeMessage ( nil , nodeconfigs . NodeRoleNode , this . clusterId , int64 ( result . Node . Id ) , models . MessageTypeHealthCheckNodeUp , models . MessageLevelSuccess , message , message , nil , false )
} else {
message := "节点\"" + result . Node . Name + "\"健康检查失败"
err = models . NewMessageDAO ( ) . CreateNodeMessage ( nil , nodeconfigs . NodeRoleNode , this . clusterId , int64 ( result . Node . Id ) , models . MessageTypeHealthCheckNodeDown , models . MessageLevelError , message , message , nil , false )
}
if err != nil {
this . logErr ( "HealthCheckExecutor" , err . Error ( ) )
return
2022-04-22 21:53:38 +08:00
}
}
}
2020-10-17 21:15:31 +08:00
// 检查单个节点
2022-04-22 21:53:38 +08:00
func ( this * HealthCheckExecutor ) runNodeOnce ( healthCheckConfig * serverconfigs . HealthCheckConfig , result * HealthCheckResult ) error {
2021-06-07 10:06:16 +08:00
// 支持IPv6
2024-04-06 10:21:52 +08:00
if iputils . IsIPv6 ( result . NodeAddr ) {
2023-06-02 14:46:38 +08:00
result . NodeAddr = configutils . QuoteIP ( result . NodeAddr )
2021-06-07 10:06:16 +08:00
}
2021-10-19 16:31:05 +08:00
if len ( healthCheckConfig . URL ) == 0 {
healthCheckConfig . URL = "http://${host}/"
}
2022-04-22 21:53:38 +08:00
var url = strings . ReplaceAll ( healthCheckConfig . URL , "${host}" , result . NodeAddr )
2020-10-17 21:15:31 +08:00
req , err := http . NewRequest ( http . MethodGet , url , nil )
if err != nil {
return err
}
2024-05-15 08:05:47 +08:00
req . Close = true
2021-10-19 16:31:05 +08:00
if len ( healthCheckConfig . UserAgent ) > 0 {
req . Header . Set ( "User-Agent" , healthCheckConfig . UserAgent )
} else {
req . Header . Set ( "User-Agent" , "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36" )
}
2021-11-29 09:52:47 +08:00
key , err := nodeutils . Base64EncodeMap ( maps . Map {
2021-10-19 16:31:05 +08:00
"onlyBasicRequest" : healthCheckConfig . OnlyBasicRequest ,
2022-05-19 17:14:32 +08:00
"accessLogIsOn" : healthCheckConfig . AccessLogIsOn ,
2021-11-29 09:52:47 +08:00
} )
2021-10-19 16:31:05 +08:00
if err != nil {
return err
}
req . Header . Set ( serverconfigs . HealthCheckHeaderName , key )
2020-10-17 21:15:31 +08:00
2022-04-22 21:53:38 +08:00
var timeout = 5 * time . Second
2020-10-17 21:15:31 +08:00
if healthCheckConfig . Timeout != nil {
timeout = healthCheckConfig . Timeout . Duration ( )
}
2022-04-22 21:53:38 +08:00
var client = & http . Client {
2020-10-17 21:15:31 +08:00
Timeout : timeout ,
Transport : & http . Transport {
2021-02-24 16:20:22 +08:00
DialContext : func ( ctx context . Context , network , addr string ) ( net . Conn , error ) {
_ , port , err := net . SplitHostPort ( addr )
if err != nil {
return nil , err
}
2021-09-13 13:46:20 +08:00
return net . DialTimeout ( network , configutils . QuoteIP ( result . NodeAddr ) + ":" + port , timeout )
2021-02-24 16:20:22 +08:00
} ,
2024-05-15 08:05:47 +08:00
MaxIdleConns : 3 ,
MaxIdleConnsPerHost : 3 ,
MaxConnsPerHost : 3 ,
2022-11-26 18:11:35 +08:00
IdleConnTimeout : 10 * time . Second ,
2020-10-17 21:15:31 +08:00
ExpectContinueTimeout : 1 * time . Second ,
TLSHandshakeTimeout : 0 ,
TLSClientConfig : & tls . Config {
InsecureSkipVerify : true ,
} ,
} ,
}
defer func ( ) {
client . CloseIdleConnections ( )
} ( )
resp , err := client . Do ( req )
if err != nil {
return err
}
_ = resp . Body . Close ( )
if len ( healthCheckConfig . StatusCodes ) > 0 && ! lists . ContainsInt ( healthCheckConfig . StatusCodes , resp . StatusCode ) {
result . Error = "invalid response status code '" + strconv . Itoa ( resp . StatusCode ) + "'"
return nil
}
result . IsOk = true
return nil
}