diff --git a/internal/re/regexp.go b/internal/re/regexp.go new file mode 100644 index 0000000..2762194 --- /dev/null +++ b/internal/re/regexp.go @@ -0,0 +1,240 @@ +// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved. + +package re + +import ( + "regexp" + "strings" +) + +var prefixReg = regexp.MustCompile(`^\(\?(\w+)\)`) // (?x) +var prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`) // (?x: ... +var braceZero = regexp.MustCompile(`^{\s*0*\s*}`) // {0} +var braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`) // {0, x} + +type Regexp struct { + exp string + rawRegexp *regexp.Regexp + + isStrict bool + isCaseInsensitive bool + keywordsMap RuneMap +} + +func MustCompile(exp string) *Regexp { + var reg = &Regexp{ + exp: exp, + rawRegexp: regexp.MustCompile(exp), + } + reg.init() + return reg +} + +func Compile(exp string) (*Regexp, error) { + reg, err := regexp.Compile(exp) + if err != nil { + return nil, err + } + return NewRegexp(reg), nil +} + +func NewRegexp(rawRegexp *regexp.Regexp) *Regexp { + var reg = &Regexp{ + exp: rawRegexp.String(), + rawRegexp: rawRegexp, + } + reg.init() + return reg +} + +func (this *Regexp) init() { + if len(this.exp) == 0 { + return + } + + //var keywords = []string{} + + var exp = strings.TrimSpace(this.exp) + + // 去掉前面的(?...) 
+ if prefixReg.MatchString(exp) { + var matches = prefixReg.FindStringSubmatch(exp) + var modifiers = matches[1] + if strings.Contains(modifiers, "i") { + this.isCaseInsensitive = true + } + exp = exp[len(matches[0]):] + } + + var keywords = this.ParseKeywords(exp) + if len(keywords) > 0 { + this.keywordsMap = NewRuneTree(keywords) + } +} + +func (this *Regexp) MatchString(s string) bool { + if this.keywordsMap != nil { + var b = this.keywordsMap.Lookup(s, this.isCaseInsensitive) + if !b { + return false + } + if this.isStrict { + return true + } + } + return this.rawRegexp.MatchString(s) +} + +func (this *Regexp) Match(s []byte) bool { + if this.keywordsMap != nil { + var b = this.keywordsMap.Lookup(string(s), this.isCaseInsensitive) + if !b { + return false + } + if this.isStrict { + return true + } + } + return this.rawRegexp.Match(s) +} + +// ParseKeywords 提取表达式中的关键词 +// TODO 支持嵌套,类似于 A(abc|bcd) +// TODO 支持 (?:xxx) +// TODO 支持 (abc)(bcd)(efg) +func (this *Regexp) ParseKeywords(exp string) []string { + var keywords = []string{} + if len(exp) == 0 { + return nil + } + + var runes = []rune(exp) + + // (a|b|c) + reg, err := regexp.Compile(exp) + if err == nil { + var countSub = reg.NumSubexp() + if countSub == 1 { + beginIndex := this.indexOfSymbol(runes, '(') + if beginIndex >= 0 { + runes = runes[beginIndex+1:] + symbolIndex := this.indexOfSymbol(runes, ')') + if symbolIndex > 0 && this.isPlain(runes[symbolIndex+1:]) { + runes = runes[:symbolIndex] + if len(runes) == 0 { + return nil + } + } + } + } + } + + var lastIndex = 0 + for index, r := range runes { + if r == '|' { + if index > 0 && runes[index-1] != '\\' { + var ks = this.parseKeyword(runes[lastIndex:index]) + if len(ks) > 0 { + keywords = append(keywords, string(ks)) + } else { + return nil + } + lastIndex = index + 1 + } + } + } + if lastIndex == 0 { + var ks = this.parseKeyword(runes) + if len(ks) > 0 { + keywords = append(keywords, string(ks)) + } else { + return nil + } + } else if lastIndex > 0 { + 
var ks = this.parseKeyword(runes[lastIndex:]) + if len(ks) > 0 { + keywords = append(keywords, string(ks)) + } else { + return nil + } + } + return keywords +} + +func (this *Regexp) parseKeyword(keyword []rune) (result []rune) { + if len(keyword) == 0 { + return + } + + // remove first \b + for index, r := range keyword { + if r == '\b' { + keyword = keyword[index+1:] + break + } else if r != '\t' && r != '\r' && r != '\n' && r != ' ' { + break + } + } + if len(keyword) == 0 { + return + } + + for index, r := range keyword { + if index == 0 && r == '^' { + continue + } + if r == '(' || r == ')' { + if index == 0 { + return nil + } + if keyword[index-1] != '\\' { + return nil + } + } + if r == '[' || r == '{' || r == '.' || r == '+' || r == '$' { + if index == 0 { + return nil + } + if keyword[index-1] != '\\' { + if r == '{' && (braceZero.MatchString(string(keyword[index:])) || braceZero2.MatchString(string(keyword[index:]))) { // r {0, ...} + return result[:len(result)-1] + } + + return + } + } + if r == '?' || r == '*' { + if index == 0 { + return nil + } + return result[:len(result)-1] + } + if r == '\\' || r == '\b' { + // TODO 将来更精细的处理 \d, \s, \$等 + break + } + + result = append(result, r) + } + return +} + +// 查找符号位置 +func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int { + for index, c := range runes { + if c == symbol && (index == 0 || runes[index-1] != '\\') { + return index + } + } + return -1 +} + +// 是否可视为为普通字符 +func (this *Regexp) isPlain(runes []rune) bool { + for _, r := range []rune{'|', '(', ')'} { + if this.indexOfSymbol(runes, r) >= 0 { + return false + } + } + return true +} diff --git a/internal/re/regexp_test.go b/internal/re/regexp_test.go new file mode 100644 index 0000000..e15aaee --- /dev/null +++ b/internal/re/regexp_test.go @@ -0,0 +1,120 @@ +// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved. 
+ +package re_test + +import ( + "github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs/firewallconfigs" + "github.com/TeaOSLab/EdgeNode/internal/re" + "github.com/iwind/TeaGo/assert" + "regexp" + "testing" +) + +func TestRegexp(t *testing.T) { + for _, s := range []string{"(?i)(abc|efg)", "abc|efg", "abc(.+)"} { + var reg = regexp.MustCompile(s) + t.Log("===" + s + "===") + t.Log(reg.LiteralPrefix()) + t.Log(reg.NumSubexp()) + t.Log(reg.SubexpNames()) + } +} + +func TestRegexp_MatchString(t *testing.T) { + var a = assert.NewAssertion(t) + + { + var r = re.MustCompile("abc") + a.IsTrue(r.MatchString("abc")) + a.IsFalse(r.MatchString("ab")) + } + + { + var r = re.MustCompile("(?i)abc|def|ghi") + a.IsTrue(r.MatchString("DEF")) + a.IsFalse(r.MatchString("ab")) + } +} + +func TestRegexp_Sub(t *testing.T) { + { + reg := regexp.MustCompile(`(a|b|c)(e|f|g)`) + for _, subName := range reg.SubexpNames() { + t.Log(subName) + } + } +} + +func TestRegexp_ParseKeywords(t *testing.T) { + var a = assert.NewAssertion(t) + + var r = re.MustCompile("") + a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)def"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)|(?:def)"), []string{})) + a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)|def"), []string{})) + a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("(?i:abc)"), []string{})) + a.IsTrue(testCompareStrings(r.ParseKeywords("\babc"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords(" \babc"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("\babc\b"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("\b(abc)"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc|efg|hij"), []string{"abc", "efg", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg|hij"), []string{"abc", "hij"})) + 
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg*|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg?|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg+|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg{2,10}|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg{0,10}|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg.+|hij"), []string{"abc", "hij"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("A(abc|bcd)"), []string{"abc", "bcd"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("^abc"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc$"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\$"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\d"), []string{"abc"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("abc{0,4}"), []string{"ab"})) + a.IsTrue(testCompareStrings(r.ParseKeywords("{0,4}"), []string{})) + a.IsTrue(testCompareStrings(r.ParseKeywords("{1,4}"), []string{})) + a.IsTrue(testCompareStrings(r.ParseKeywords("中文|北京|上海|golang"), []string{"中文", "北京", "上海", "golang"})) +} + +func TestRegexp_ParseKeywords2(t *testing.T) { + var r = re.MustCompile("") + + var policy = firewallconfigs.HTTPFirewallTemplate() + for _, group := range policy.Inbound.Groups { + for _, set := range group.Sets { + for _, rule := range set.Rules { + if rule.Operator == firewallconfigs.HTTPFirewallRuleOperatorMatch || rule.Operator == firewallconfigs.HTTPFirewallRuleOperatorNotMatch { + t.Log(set.Name+":", rule.Value, "=>", r.ParseKeywords(rule.Value)) + } + } + } + } +} + +func BenchmarkRegexp_MatchString(b *testing.B) { + var r = re.MustCompile("(?i)abc|def|ghi") + for i := 0; i < b.N; i++ { + r.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36") 
// RuneMap is one level of a keyword trie: it maps a rune to the node reached
// by consuming that rune.
type RuneMap map[rune]*RuneTree

// Lookup reports whether s contains at least one of the indexed keywords as a
// contiguous substring. With caseInsensitive set, runes are compared using
// Unicode simple case folding, the same folding the regexp package applies
// for the (?i) flag.
func (this *RuneMap) Lookup(s string, caseInsensitive bool) bool {
	if this == nil {
		return false
	}
	return this.lookup([]rune(s), caseInsensitive, 0)
}

// lookup searches runes for a keyword. At depth 0 every start position is
// tried; at any deeper level the keyword must continue at runes[0], so a
// keyword only matches contiguous input.
func (this RuneMap) lookup(runes []rune, caseInsensitive bool, depth int) bool {
	if depth > 0 {
		return this.matchPrefix(runes, caseInsensitive)
	}
	for start := range runes {
		if this.matchPrefix(runes[start:], caseInsensitive) {
			return true
		}
	}
	return false
}

// matchPrefix reports whether some keyword stored below this level is a
// prefix of runes.
func (this RuneMap) matchPrefix(runes []rune, caseInsensitive bool) bool {
	if len(runes) == 0 || len(this) == 0 {
		return false
	}

	var r = runes[0]
	if this.follow(r, runes[1:], caseInsensitive) {
		return true
	}
	if !caseInsensitive {
		return false
	}

	// Try every other member of the rune's folding orbit. All of them are
	// tried even when r itself is present, because keywords differing only
	// in case occupy different branches of the trie.
	for folded := unicode.SimpleFold(r); folded != r; folded = unicode.SimpleFold(folded) {
		if this.follow(folded, runes[1:], caseInsensitive) {
			return true
		}
	}
	return false
}

// follow descends into the child for r and reports whether a keyword ends
// there or continues through rest.
func (this RuneMap) follow(r rune, rest []rune, caseInsensitive bool) bool {
	tree, ok := this[r]
	if !ok || tree == nil {
		return false
	}
	return tree.IsEnd || tree.Children.matchPrefix(rest, caseInsensitive)
}

// RuneTree is a trie node.
type RuneTree struct {
	// Children holds the nodes reachable from this node.
	Children RuneMap
	// IsEnd is true when a keyword ends at this node.
	IsEnd bool
}

// NewRuneTree builds a trie from list. Empty strings are ignored.
func NewRuneTree(list []string) RuneMap {
	var rootMap = RuneMap{}
	for _, s := range list {
		if len(s) == 0 {
			continue
		}

		var level = rootMap
		var runes = []rune(s)
		for index, r := range runes {
			tree, ok := level[r]
			if !ok {
				tree = &RuneTree{
					Children: RuneMap{},
				}
				level[r] = tree
			}
			if index == len(runes)-1 {
				tree.IsEnd = true
			}
			level = tree.Children
		}
	}
	return rootMap
}
b/internal/re/rune_tree_test.go new file mode 100644 index 0000000..341cb8c --- /dev/null +++ b/internal/re/rune_tree_test.go @@ -0,0 +1,47 @@ +// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved. + +package re_test + +import ( + "github.com/TeaOSLab/EdgeNode/internal/re" + "github.com/iwind/TeaGo/assert" + "regexp" + "testing" +) + +func TestNewRuneTree(t *testing.T) { + var a = assert.NewAssertion(t) + + var tree = re.NewRuneTree([]string{"abc", "abd", "def", "GHI", "中国", "@"}) + a.IsTrue(tree.Lookup("ABC", true)) + a.IsTrue(tree.Lookup("ABC1", true)) + a.IsTrue(tree.Lookup("1ABC", true)) + a.IsTrue(tree.Lookup("def", true)) + a.IsTrue(tree.Lookup("ghI", true)) + a.IsFalse(tree.Lookup("d ef", true)) + a.IsFalse(tree.Lookup("de", true)) + a.IsFalse(tree.Lookup("de f", true)) + a.IsTrue(tree.Lookup("我是中国人", true)) + a.IsTrue(tree.Lookup("iwind.liu@gmail.com", true)) +} + +func BenchmarkRuneMap_Lookup(b *testing.B) { + var tree = re.NewRuneTree([]string{"abc", "abd", "def", "ghi", "中国"}) + for i := 0; i < b.N; i++ { + tree.Lookup("我来自中国", true) + } +} + +func BenchmarkRuneMap_Lookup2_NOT_FOUND(b *testing.B) { + var tree = re.NewRuneTree([]string{"abc", "abd", "cde", "GHI"}) + for i := 0; i < b.N; i++ { + tree.Lookup("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36", true) + } +} + +func BenchmarkRune_Regexp_FOUND(b *testing.B) { + var reg = regexp.MustCompile("(?i)abc|abd|cde|GHI") + for i := 0; i < b.N; i++ { + reg.MatchString("HELLO WORLD ABC 123 456 abc HELLO WORLD HELLO WORLD ABC 123 456 abc HELLO WORLD HELLO WORLD ABC 123 456 abc HELLO WORLD") + } +} diff --git a/internal/waf/rule.go b/internal/waf/rule.go index 47a4c6b..06beebe 100644 --- a/internal/waf/rule.go +++ b/internal/waf/rule.go @@ -7,6 +7,7 @@ import ( "errors" "github.com/TeaOSLab/EdgeCommon/pkg/configutils" "github.com/TeaOSLab/EdgeCommon/pkg/serverconfigs/filterconfigs" + 
"github.com/TeaOSLab/EdgeNode/internal/re" "github.com/TeaOSLab/EdgeNode/internal/remotelogs" "github.com/TeaOSLab/EdgeNode/internal/waf/checkpoints" "github.com/TeaOSLab/EdgeNode/internal/waf/requests" @@ -44,7 +45,7 @@ type Rule struct { ipValue net.IP floatValue float64 - reg *regexp.Regexp + reg *re.Regexp } func NewRule() *Rule { @@ -74,7 +75,7 @@ func (this *Rule) Init() error { v = this.unescape(v) - reg, err := regexp.Compile(v) + reg, err := re.Compile(v) if err != nil { return err } @@ -87,7 +88,7 @@ func (this *Rule) Init() error { v = this.unescape(v) - reg, err := regexp.Compile(v) + reg, err := re.Compile(v) if err != nil { return err } diff --git a/internal/waf/utils/utils.go b/internal/waf/utils/utils.go index 26ca37e..c223ee1 100644 --- a/internal/waf/utils/utils.go +++ b/internal/waf/utils/utils.go @@ -2,19 +2,18 @@ package utils import ( "fmt" + "github.com/TeaOSLab/EdgeNode/internal/re" "github.com/TeaOSLab/EdgeNode/internal/ttlcache" "github.com/cespare/xxhash" "github.com/iwind/TeaGo/types" - "regexp" "strconv" "time" ) -//var grid = grids.NewGrid(32, grids.NewLimitCountOpt(1000_0000)) var cache = ttlcache.NewCache() -// 正则表达式匹配字符串,并缓存结果 -func MatchStringCache(regex *regexp.Regexp, s string) bool { +// MatchStringCache 正则表达式匹配字符串,并缓存结果 +func MatchStringCache(regex *re.Regexp, s string) bool { // 如果长度超过4096,大概率是不能重用的 if len(s) > 4096 { return regex.MatchString(s) @@ -35,8 +34,8 @@ func MatchStringCache(regex *regexp.Regexp, s string) bool { return b } -// 正则表达式匹配字节slice,并缓存结果 -func MatchBytesCache(regex *regexp.Regexp, byteSlice []byte) bool { +// MatchBytesCache 正则表达式匹配字节slice,并缓存结果 +func MatchBytesCache(regex *re.Regexp, byteSlice []byte) bool { // 如果长度超过4096,大概率是不能重用的 if len(byteSlice) > 4096 { return regex.Match(byteSlice)