提升WAF正则表达式性能(提升20%以上)

This commit is contained in:
刘祥超
2022-01-08 11:45:14 +08:00
parent 390be7f6c6
commit b8d7e3f5b4
6 changed files with 490 additions and 9 deletions

240
internal/re/regexp.go Normal file
View File

@@ -0,0 +1,240 @@
// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re
import (
"regexp"
"strings"
)
// Patterns used while extracting literal keywords from an expression.
var (
	// prefixReg matches a leading flag group such as "(?i)" and captures the flags.
	prefixReg = regexp.MustCompile(`^\(\?(\w+)\)`)

	// prefixReg2 matches the opening of a leading scoped flag group such as "(?i:".
	prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`)

	// braceZero matches a leading repetition with an empty or all-zero count, such as "{0}".
	braceZero = regexp.MustCompile(`^{\s*0*\s*}`)

	// braceZero2 matches a leading repetition with an empty or all-zero lower bound, such as "{0, x}".
	braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`)
)
// Regexp wraps a compiled regular expression together with data that init
// derives from the expression's source text.
type Regexp struct {
// exp is the source text of the expression.
exp string
// rawRegexp is the compiled expression that MatchString and Match delegate to.
rawRegexp *regexp.Regexp
// isStrict makes MatchString and Match return true as soon as the keyword
// lookup succeeds, without running rawRegexp.
// NOTE(review): no assignment to this field is visible in this file.
isStrict bool
// isCaseInsensitive is set by init when the expression starts with a flag
// group that contains "i".
isCaseInsensitive bool
// keywordsMap is built by init from the result of ParseKeywords; it stays nil
// when no keywords were extracted.
keywordsMap RuneMap
}
// MustCompile compiles exp with regexp.MustCompile (which panics on an invalid
// expression), wraps the result in a Regexp and runs init on it.
func MustCompile(exp string) *Regexp {
	compiled := regexp.MustCompile(exp)

	result := new(Regexp)
	result.exp = exp
	result.rawRegexp = compiled
	result.init()

	return result
}
// Compile compiles exp with regexp.Compile and wraps the result with NewRegexp.
// The compile error is returned unchanged when exp is invalid.
func Compile(exp string) (*Regexp, error) {
	rawRegexp, err := regexp.Compile(exp)
	if err == nil {
		return NewRegexp(rawRegexp), nil
	}
	return nil, err
}
// NewRegexp wraps an already compiled expression. The source text is taken
// from rawRegexp.String() and init is run before the wrapper is returned.
func NewRegexp(rawRegexp *regexp.Regexp) *Regexp {
	source := rawRegexp.String()

	wrapped := &Regexp{rawRegexp: rawRegexp}
	wrapped.exp = source
	wrapped.init()

	return wrapped
}
// init derives the case-insensitivity flag and the keyword map from the
// expression source. It does nothing for an empty expression.
func (this *Regexp) init() {
	if this.exp == "" {
		return
	}

	pattern := strings.TrimSpace(this.exp)

	// Strip a leading flag group such as "(?i)" and remember whether it
	// contains the "i" flag.
	if found := prefixReg.FindStringSubmatch(pattern); found != nil {
		if strings.Contains(found[1], "i") {
			this.isCaseInsensitive = true
		}
		pattern = pattern[len(found[0]):]
	}

	// Build the keyword map only when at least one keyword was extracted.
	if keywords := this.ParseKeywords(pattern); len(keywords) > 0 {
		this.keywordsMap = NewRuneTree(keywords)
	}
}
// MatchString reports whether s matches the expression.
//
// When a keyword map exists it is consulted first: a failed lookup returns
// false without running the compiled expression, and a successful lookup
// returns true immediately when isStrict is set.
func (this *Regexp) MatchString(s string) bool {
	if this.keywordsMap == nil {
		return this.rawRegexp.MatchString(s)
	}
	if !this.keywordsMap.Lookup(s, this.isCaseInsensitive) {
		return false
	}
	return this.isStrict || this.rawRegexp.MatchString(s)
}
// Match reports whether the byte slice s matches the expression.
//
// When a keyword map exists, s is converted to a string for the keyword
// lookup: a failed lookup returns false without running the compiled
// expression, and a successful lookup returns true immediately when isStrict
// is set.
func (this *Regexp) Match(s []byte) bool {
	if this.keywordsMap == nil {
		return this.rawRegexp.Match(s)
	}
	if !this.keywordsMap.Lookup(string(s), this.isCaseInsensitive) {
		return false
	}
	return this.isStrict || this.rawRegexp.Match(s)
}
// ParseKeywords extracts literal keywords from exp.
//
// It returns nil when no keyword set could be derived. Otherwise it returns
// one keyword per alternative; the intent is that every input accepted by exp
// contains at least one of them, so that a failed keyword lookup can reject an
// input without running the regular expression.
//
// TODO support nesting, such as A(abc|bcd)
// TODO support (?:xxx)
// TODO support abc)(bcd)(efg)
func (this *Regexp) ParseKeywords(exp string) []string {
	if len(exp) == 0 {
		return nil
	}

	var runes = []rune(exp)

	// prefix(a|b|c)suffix: work on the alternatives inside the single group.
	if inner, ok := this.unwrapSingleGroup(exp, runes); ok {
		runes = inner
	}

	// Split on every unescaped '|'. Each piece must yield a keyword, otherwise
	// that alternative could match inputs that contain none of the keywords.
	var keywords []string
	var lastIndex = 0
	for index, r := range runes {
		if r != '|' || this.isEscapedAt(runes, index) {
			continue
		}
		var ks = this.parseKeyword(runes[lastIndex:index])
		if len(ks) == 0 {
			return nil
		}
		keywords = append(keywords, string(ks))
		lastIndex = index + 1
	}

	var ks = this.parseKeyword(runes[lastIndex:])
	if len(ks) == 0 {
		return nil
	}
	return append(keywords, string(ks))
}

// unwrapSingleGroup returns the content of the only capture group of exp when
// it is safe to take keywords from that content alone.
//
// The shortcut is refused when:
//   - exp does not compile or does not have exactly one capture group;
//   - the text before the group contains '|', '(' or ')' (an alternative
//     outside the group could match without it) or '[' (the '(' that was found
//     may be a literal inside a character class);
//   - the text after the group contains '|', '(' or ')';
//   - the group is directly followed by '?', '*' or '{', because the group may
//     then be optional. '{' is always refused, which is conservative for
//     counts such as {2}.
func (this *Regexp) unwrapSingleGroup(exp string, runes []rune) ([]rune, bool) {
	reg, err := regexp.Compile(exp)
	if err != nil || reg.NumSubexp() != 1 {
		return nil, false
	}

	var beginIndex = this.indexOfSymbol(runes, '(')
	if beginIndex < 0 {
		return nil, false
	}
	var prefix = runes[:beginIndex]
	var rest = runes[beginIndex+1:]

	var endIndex = this.indexOfSymbol(rest, ')')
	if endIndex <= 0 {
		return nil, false
	}
	var suffix = rest[endIndex+1:]

	if !this.isPlain(prefix) || !this.isPlain(suffix) {
		return nil, false
	}
	if this.indexOfSymbol(prefix, '[') >= 0 {
		return nil, false
	}
	if len(suffix) > 0 {
		switch suffix[0] {
		case '?', '*', '{':
			return nil, false
		}
	}

	return rest[:endIndex], true
}

// parseKeyword returns the literal prefix of a single alternative, or an
// empty result when no literal prefix can be used.
//
// Scanning stops at the first rune that is not treated as a plain literal.
// A backslash always stops the scan, so escaped characters are never part of
// the result.
func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
	if len(keyword) == 0 {
		return nil
	}

	// Drop everything up to and including a leading backspace rune ('\b',
	// U+0008) when only whitespace precedes it.
	for index, r := range keyword {
		if r == '\b' {
			keyword = keyword[index+1:]
			break
		}
		if r != '\t' && r != '\r' && r != '\n' && r != ' ' {
			break
		}
	}
	if len(keyword) == 0 {
		return nil
	}

	for index, r := range keyword {
		switch r {
		case '^':
			// A leading anchor is skipped; elsewhere '^' is kept as a literal.
			if index == 0 {
				continue
			}
		case '(', ')':
			// Groups are not supported inside an alternative.
			return nil
		case '[', '.', '+', '$':
			if index == 0 {
				return nil
			}
			return result
		case '{':
			if index == 0 {
				return nil
			}
			// x{0} and x{0,n} make the previous rune optional.
			var tail = string(keyword[index:])
			if braceZero.MatchString(tail) || braceZero2.MatchString(tail) {
				return this.dropLastRune(result)
			}
			return result
		case '?', '*':
			if index == 0 {
				return nil
			}
			// The previous rune is optional.
			return this.dropLastRune(result)
		case '\\', '\b':
			// TODO handle \d, \s, \$ and similar escapes more precisely
			return result
		}
		result = append(result, r)
	}
	return result
}

// dropLastRune returns runes without its last element. An empty input yields
// nil instead of panicking, which happens for expressions such as "^*" where
// the quantifier follows a skipped anchor.
func (this *Regexp) dropLastRune(runes []rune) []rune {
	if len(runes) == 0 {
		return nil
	}
	return runes[:len(runes)-1]
}

// isEscapedAt reports whether runes[index] is escaped, i.e. directly preceded
// by an odd number of backslashes. An even number means the backslashes escape
// each other and the rune keeps its special meaning.
func (this *Regexp) isEscapedAt(runes []rune, index int) bool {
	var backslashes = 0
	for i := index - 1; i >= 0 && runes[i] == '\\'; i-- {
		backslashes++
	}
	return backslashes%2 == 1
}

// indexOfSymbol returns the index of the first unescaped occurrence of symbol
// in runes, or -1 when there is none.
func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int {
	for index, c := range runes {
		if c == symbol && !this.isEscapedAt(runes, index) {
			return index
		}
	}
	return -1
}

// isPlain reports whether runes contains none of the unescaped symbols
// '|', '(' and ')'.
func (this *Regexp) isPlain(runes []rune) bool {
	for _, r := range []rune{'|', '(', ')'} {
		if this.indexOfSymbol(runes, r) >= 0 {
			return false
		}
	}
	return true
}

120
internal/re/regexp_test.go Normal file
View File

@@ -0,0 +1,120 @@
// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re_test
import (
"github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs/firewallconfigs"
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/iwind/TeaGo/assert"
"regexp"
"testing"
)
// TestRegexp logs what the standard regexp package reports (literal prefix,
// number of sub-expressions and their names) for a few sample expressions.
func TestRegexp(t *testing.T) {
	patterns := [...]string{"(?i)(abc|efg)", "abc|efg", "abc(.+)"}
	for i := range patterns {
		compiled := regexp.MustCompile(patterns[i])
		t.Log("===" + patterns[i] + "===")
		t.Log(compiled.LiteralPrefix())
		t.Log(compiled.NumSubexp())
		t.Log(compiled.SubexpNames())
	}
}
// TestRegexp_MatchString checks MatchString for a plain literal and for a
// case-insensitive alternation.
func TestRegexp_MatchString(t *testing.T) {
	var a = assert.NewAssertion(t)

	var literal = re.MustCompile("abc")
	a.IsTrue(literal.MatchString("abc"))
	a.IsFalse(literal.MatchString("ab"))

	var alternation = re.MustCompile("(?i)abc|def|ghi")
	a.IsTrue(alternation.MatchString("DEF"))
	a.IsFalse(alternation.MatchString("ab"))
}
// TestRegexp_Sub logs the sub-expression names reported by the standard
// regexp package for an expression with two capture groups.
func TestRegexp_Sub(t *testing.T) {
	names := regexp.MustCompile(`(a|b|c)(e|f|g)`).SubexpNames()
	for i := 0; i < len(names); i++ {
		t.Log(names[i])
	}
}
// TestRegexp_ParseKeywords checks the keywords extracted from a list of sample
// expressions. Note that "\b" inside the interpreted string literals below is
// the backspace rune, not the regexp word-boundary escape.
func TestRegexp_ParseKeywords(t *testing.T) {
	var a = assert.NewAssertion(t)
	var r = re.MustCompile("")

	var cases = []struct {
		exp      string
		expected []string
	}{
		{"(abc)def", []string{"abc"}},
		{"(abc)|(?:def)", []string{}},
		{"(abc)|def", []string{}},
		{"(abc)", []string{"abc"}},
		{"(?i:abc)", []string{}},
		{"\babc", []string{"abc"}},
		{" \babc", []string{"abc"}},
		{"\babc\b", []string{"abc"}},
		{"\b(abc)", []string{"abc"}},
		{"abc", []string{"abc"}},
		{"abc|efg|hij", []string{"abc", "efg", "hij"}},
		{"abc\\|efg|hij", []string{"abc", "hij"}},
		{"abc\\|efg*|hij", []string{"abc", "hij"}},
		{"abc\\|efg?|hij", []string{"abc", "hij"}},
		{"abc\\|efg+|hij", []string{"abc", "hij"}},
		{"abc\\|efg{2,10}|hij", []string{"abc", "hij"}},
		{"abc\\|efg{0,10}|hij", []string{"abc", "hij"}},
		{"abc\\|efg.+|hij", []string{"abc", "hij"}},
		{"A(abc|bcd)", []string{"abc", "bcd"}},
		{"^abc", []string{"abc"}},
		{"abc$", []string{"abc"}},
		{"abc\\$", []string{"abc"}},
		{"abc\\d", []string{"abc"}},
		{"abc{0,4}", []string{"ab"}},
		{"{0,4}", []string{}},
		{"{1,4}", []string{}},
		{"中文|北京|上海|golang", []string{"中文", "北京", "上海", "golang"}},
	}

	for _, c := range cases {
		a.IsTrue(testCompareStrings(r.ParseKeywords(c.exp), c.expected))
	}
}
// TestRegexp_ParseKeywords2 logs the keywords extracted from every "match" and
// "not match" rule of the inbound groups in the firewall template.
func TestRegexp_ParseKeywords2(t *testing.T) {
	var r = re.MustCompile("")
	var policy = firewallconfigs.HTTPFirewallTemplate()

	for _, group := range policy.Inbound.Groups {
		for _, set := range group.Sets {
			for _, rule := range set.Rules {
				var isMatchRule = rule.Operator == firewallconfigs.HTTPFirewallRuleOperatorMatch ||
					rule.Operator == firewallconfigs.HTTPFirewallRuleOperatorNotMatch
				if !isMatchRule {
					continue
				}
				t.Log(set.Name+":", rule.Value, "=>", r.ParseKeywords(rule.Value))
			}
		}
	}
}
// BenchmarkRegexp_MatchString measures re.Regexp.MatchString on a browser
// user-agent string.
func BenchmarkRegexp_MatchString(b *testing.B) {
	const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

	var r = re.MustCompile("(?i)abc|def|ghi")
	for n := b.N; n > 0; n-- {
		r.MatchString(userAgent)
	}
}
// BenchmarkRegexp_MatchString2 measures the standard regexp.Regexp.MatchString
// on the same expression and input as BenchmarkRegexp_MatchString.
func BenchmarkRegexp_MatchString2(b *testing.B) {
	const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

	var r = regexp.MustCompile("(?i)abc|def|ghi")
	for n := b.N; n > 0; n-- {
		r.MatchString(userAgent)
	}
}
// testCompareStrings reports whether s1 and s2 have the same length and the
// same elements in the same order. A nil slice equals an empty slice.
func testCompareStrings(s1 []string, s2 []string) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := 0; i < len(s1); i++ {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}

74
internal/re/rune_tree.go Normal file
View File

@@ -0,0 +1,74 @@
// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re
// RuneMap is one level of a keyword trie: it maps a rune to the node reached
// by consuming that rune.
type RuneMap map[rune]*RuneTree

// Lookup reports whether s contains at least one of the keywords stored in the
// trie as a contiguous substring.
//
// With caseInsensitive set, ASCII letters match regardless of case, and the
// Kelvin sign (U+212A) and long s (U+017F) are treated as variants of 'k' and
// 's', the same way the standard regexp package folds them under (?i).
// NOTE(review): case folding of other non-ASCII letters is not handled, so a
// case-insensitive keyword containing such letters can still be missed.
func (this RuneMap) Lookup(s string, caseInsensitive bool) bool {
	return this.lookup([]rune(s), caseInsensitive, 0)
}

// lookup searches runes for a keyword. At depth 0 every start position is
// tried; at any other depth the keyword must start at runes[0].
func (this RuneMap) lookup(runes []rune, caseInsensitive bool, depth int) bool {
	if len(runes) == 0 {
		return false
	}
	if depth > 0 {
		return this.matchPrefix(runes, caseInsensitive)
	}
	for i := range runes {
		if this.matchPrefix(runes[i:], caseInsensitive) {
			return true
		}
	}
	return false
}

// matchPrefix reports whether some keyword stored below this level is a prefix
// of runes. Every case variant of the current rune is followed, because
// keywords that differ only in case live in different branches.
func (this RuneMap) matchPrefix(runes []rune, caseInsensitive bool) bool {
	if len(runes) == 0 || len(this) == 0 {
		return false
	}

	var buf [4]rune
	var candidates = append(buf[:0], runes[0])
	if caseInsensitive {
		candidates = appendFoldVariants(candidates, runes[0])
	}

	for _, c := range candidates {
		tree, ok := this[c]
		if !ok || tree == nil {
			continue
		}
		if tree.IsEnd || tree.Children.matchPrefix(runes[1:], caseInsensitive) {
			return true
		}
	}
	return false
}

// appendFoldVariants appends to dst the runes that are equal to r under case
// folding, excluding r itself. Only ASCII letters and their two non-ASCII fold
// partners (U+212A for k, U+017F for s) are covered.
func appendFoldVariants(dst []rune, r rune) []rune {
	switch {
	case r >= 'a' && r <= 'z':
		dst = append(dst, r-('a'-'A'))
	case r >= 'A' && r <= 'Z':
		dst = append(dst, r+('a'-'A'))
	}

	switch r {
	case 'k', 'K':
		dst = append(dst, '\u212A')
	case '\u212A':
		dst = append(dst, 'k', 'K')
	case 's', 'S':
		dst = append(dst, '\u017F')
	case '\u017F':
		dst = append(dst, 's', 'S')
	}
	return dst
}

// RuneTree is a trie node.
type RuneTree struct {
	// Children holds the next trie level.
	Children RuneMap
	// IsEnd is true when a keyword ends at this node.
	IsEnd bool
}

// NewRuneTree builds a trie from list and returns its root level. Empty
// strings in list are ignored.
func NewRuneTree(list []string) RuneMap {
	var rootMap = RuneMap{}
	for _, s := range list {
		if len(s) == 0 {
			continue
		}

		var lastMap = rootMap
		var runes = []rune(s)
		for index, r := range runes {
			tree, ok := lastMap[r]
			if !ok {
				tree = &RuneTree{
					Children: RuneMap{},
				}
				lastMap[r] = tree
			}
			if index == len(runes)-1 {
				tree.IsEnd = true
			}
			lastMap = tree.Children
		}
	}
	return rootMap
}

View File

@@ -0,0 +1,47 @@
// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re_test
import (
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/iwind/TeaGo/assert"
"regexp"
"testing"
)
// TestNewRuneTree checks case-insensitive lookups against a small keyword set.
func TestNewRuneTree(t *testing.T) {
	var a = assert.NewAssertion(t)
	var tree = re.NewRuneTree([]string{"abc", "abd", "def", "GHI", "中国", "@"})

	var cases = []struct {
		input string
		found bool
	}{
		{"ABC", true},
		{"ABC1", true},
		{"1ABC", true},
		{"def", true},
		{"ghI", true},
		{"d ef", false},
		{"de", false},
		{"de f", false},
		{"我是中国人", true},
		{"iwind.liu@gmail.com", true},
	}

	for _, c := range cases {
		if c.found {
			a.IsTrue(tree.Lookup(c.input, true))
		} else {
			a.IsFalse(tree.Lookup(c.input, true))
		}
	}
}
// BenchmarkRuneMap_Lookup measures a case-insensitive lookup that finds a
// keyword at the end of a short input.
func BenchmarkRuneMap_Lookup(b *testing.B) {
	const input = "我来自中国"

	var tree = re.NewRuneTree([]string{"abc", "abd", "def", "ghi", "中国"})
	for n := b.N; n > 0; n-- {
		tree.Lookup(input, true)
	}
}
// BenchmarkRuneMap_Lookup2_NOT_FOUND measures a case-insensitive lookup over a
// browser user-agent string.
func BenchmarkRuneMap_Lookup2_NOT_FOUND(b *testing.B) {
	const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

	var tree = re.NewRuneTree([]string{"abc", "abd", "cde", "GHI"})
	for n := b.N; n > 0; n-- {
		tree.Lookup(userAgent, true)
	}
}
// BenchmarkRune_Regexp_FOUND measures the standard regexp package on a
// case-insensitive alternation and a fixed input.
func BenchmarkRune_Regexp_FOUND(b *testing.B) {
	const input = "HELLO WORLD ABC 123 456 abc HELLO WORLD HELLO WORLD ABC 123 456 abc HELLO WORLD HELLO WORLD ABC 123 456 abc HELLO WORLD"

	var reg = regexp.MustCompile("(?i)abc|abd|cde|GHI")
	for n := b.N; n > 0; n-- {
		reg.MatchString(input)
	}
}

View File

@@ -7,6 +7,7 @@ import (
"errors" "errors"
"github.com/TeaOSLab/EdgeCommon/pkg/configutils" "github.com/TeaOSLab/EdgeCommon/pkg/configutils"
"github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs/filterconfigs" "github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs/filterconfigs"
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/TeaOSLab/EdgeNode/internal/remotelogs" "github.com/TeaOSLab/EdgeNode/internal/remotelogs"
"github.com/TeaOSLab/EdgeNode/internal/waf/checkpoints" "github.com/TeaOSLab/EdgeNode/internal/waf/checkpoints"
"github.com/TeaOSLab/EdgeNode/internal/waf/requests" "github.com/TeaOSLab/EdgeNode/internal/waf/requests"
@@ -44,7 +45,7 @@ type Rule struct {
ipValue net.IP ipValue net.IP
floatValue float64 floatValue float64
reg *regexp.Regexp reg *re.Regexp
} }
func NewRule() *Rule { func NewRule() *Rule {
@@ -74,7 +75,7 @@ func (this *Rule) Init() error {
v = this.unescape(v) v = this.unescape(v)
reg, err := regexp.Compile(v) reg, err := re.Compile(v)
if err != nil { if err != nil {
return err return err
} }
@@ -87,7 +88,7 @@ func (this *Rule) Init() error {
v = this.unescape(v) v = this.unescape(v)
reg, err := regexp.Compile(v) reg, err := re.Compile(v)
if err != nil { if err != nil {
return err return err
} }

View File

@@ -2,19 +2,18 @@ package utils
import ( import (
"fmt" "fmt"
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/TeaOSLab/EdgeNode/internal/ttlcache" "github.com/TeaOSLab/EdgeNode/internal/ttlcache"
"github.com/cespare/xxhash" "github.com/cespare/xxhash"
"github.com/iwind/TeaGo/types" "github.com/iwind/TeaGo/types"
"regexp"
"strconv" "strconv"
"time" "time"
) )
//var grid = grids.NewGrid(32, grids.NewLimitCountOpt(1000_0000))
var cache = ttlcache.NewCache() var cache = ttlcache.NewCache()
// 正则表达式匹配字符串,并缓存结果 // MatchStringCache 正则表达式匹配字符串,并缓存结果
func MatchStringCache(regex *regexp.Regexp, s string) bool { func MatchStringCache(regex *re.Regexp, s string) bool {
// 如果长度超过4096大概率是不能重用的 // 如果长度超过4096大概率是不能重用的
if len(s) > 4096 { if len(s) > 4096 {
return regex.MatchString(s) return regex.MatchString(s)
@@ -35,8 +34,8 @@ func MatchStringCache(regex *regexp.Regexp, s string) bool {
return b return b
} }
// 正则表达式匹配字节slice并缓存结果 // MatchBytesCache 正则表达式匹配字节slice并缓存结果
func MatchBytesCache(regex *regexp.Regexp, byteSlice []byte) bool { func MatchBytesCache(regex *re.Regexp, byteSlice []byte) bool {
// 如果长度超过4096大概率是不能重用的 // 如果长度超过4096大概率是不能重用的
if len(byteSlice) > 4096 { if len(byteSlice) > 4096 {
return regex.Match(byteSlice) return regex.Match(byteSlice)