优化代码

This commit is contained in:
GoEdgeLab
2022-02-24 16:44:28 +08:00
parent bfa0545fdc
commit 437308209b
3 changed files with 171 additions and 129 deletions

View File

@@ -4,13 +4,14 @@ package re
import (
"regexp"
"regexp/syntax"
"strings"
)
var prefixReg = regexp.MustCompile(`^\(\?([\w\s]+)\)`) // (?x)
var prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`) // (?x: ...
var braceZero = regexp.MustCompile(`^{\s*0*\s*}`) // {0}
var braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`) // {0, x}
var braceZeroReg = regexp.MustCompile(`^{\s*0*\s*}`) // {0}
var braceZeroReg2 = regexp.MustCompile(`^{\s*0*\s*,`) // {0, x}
type Regexp struct {
exp string
@@ -53,8 +54,6 @@ func (this *Regexp) init() {
return
}
//var keywords = []string{}
var exp = strings.TrimSpace(this.exp)
// 去掉前面的(?...)
@@ -68,9 +67,23 @@ func (this *Regexp) init() {
}
var keywords = this.ParseKeywords(exp)
this.keywords = keywords
if len(keywords) > 0 {
this.keywordsMap = NewRuneTree(keywords)
var filteredKeywords = []string{}
var minLength = 1
var isValid = true
for _, keyword := range keywords {
if len(keyword) <= minLength {
isValid = false
break
}
}
if isValid {
filteredKeywords = keywords
}
this.keywords = filteredKeywords
if len(filteredKeywords) > 0 {
this.keywordsMap = NewRuneTree(filteredKeywords)
}
}
@@ -96,6 +109,7 @@ func (this *Regexp) MatchString(s string) bool {
return true
}
}
return this.rawRegexp.MatchString(s)
}
@@ -113,104 +127,103 @@ func (this *Regexp) Match(s []byte) bool {
}
// ParseKeywords 提取表达式中的关键词
// TODO 支持嵌套,类似于 A(abc|bcd)
// TODO 支持 (?:xxx)
// TODO 支持 (abc)(bcd)(efg)
func (this *Regexp) ParseKeywords(exp string) []string {
var keywords = []string{}
func (this *Regexp) ParseKeywords(exp string) (keywords []string) {
if len(exp) == 0 {
return nil
}
var runes = []rune(exp)
// (a|b|c)
reg, err := regexp.Compile(exp)
if err == nil {
var countSub = reg.NumSubexp()
if countSub == 1 {
beginIndex := this.indexOfSymbol(runes, '(')
if beginIndex >= 0 {
runes = runes[beginIndex+1:]
symbolIndex := this.indexOfSymbol(runes, ')')
if symbolIndex > 0 && this.isPlain(runes[symbolIndex+1:]) {
runes = runes[:symbolIndex]
if len(runes) == 0 {
reg, err := syntax.Parse(exp, syntax.Perl)
if err != nil {
return nil
}
}
}
}
}
var lastIndex = 0
for index, r := range runes {
if r == '|' {
if index > 0 && runes[index-1] != '\\' {
var ks = this.parseKeyword(runes[lastIndex:index])
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
if len(reg.Sub) == 0 {
var keywordRunes = this.parseKeyword(reg.String())
if len(keywordRunes) > 0 {
keywords = append(keywords, string(keywordRunes))
}
lastIndex = index + 1
}
}
}
if lastIndex == 0 {
var ks = this.parseKeyword(runes)
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
}
} else if lastIndex > 0 {
var ks = this.parseKeyword(runes[lastIndex:])
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
}
}
return keywords
}
func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
if len(keyword) == 0 {
return
}
// remove first \b
for index, r := range keyword {
if r == '\b' {
keyword = keyword[index+1:]
break
} else if r != '\t' && r != '\r' && r != '\n' && r != ' ' {
break
if len(reg.Sub) == 1 {
if reg.Op == syntax.OpStar || reg.Op == syntax.OpQuest || reg.Op == syntax.OpRepeat {
return nil
}
}
if len(keyword) == 0 {
return
return this.ParseKeywords(reg.Sub[0].String())
}
for index, r := range keyword {
if index == 0 && r == '^' {
switch reg.Op {
case syntax.OpConcat:
var prevKeywords = []string{}
var isStarted bool
for _, sub := range reg.Sub {
if sub.String() == `\b` {
if isStarted {
break
}
continue
}
if r == '(' || r == ')' {
if index == 0 {
if sub.Op != syntax.OpLiteral && sub.Op != syntax.OpCapture && sub.Op != syntax.OpAlternate {
if isStarted {
break
}
continue
}
var subKeywords = this.ParseKeywords(sub.String())
if len(subKeywords) > 0 {
if !isStarted {
prevKeywords = subKeywords
isStarted = true
} else {
for _, prevKeyword := range prevKeywords {
for _, subKeyword := range subKeywords {
keywords = append(keywords, prevKeyword+subKeyword)
}
}
prevKeywords = keywords
}
} else {
break
}
}
if len(prevKeywords) > 0 && len(keywords) == 0 {
keywords = prevKeywords
}
case syntax.OpAlternate:
for _, sub := range reg.Sub {
var subKeywords = this.ParseKeywords(sub.String())
if len(subKeywords) == 0 {
keywords = nil
return
}
keywords = append(keywords, subKeywords...)
}
}
return
}
func (this *Regexp) parseKeyword(subExp string) (result []rune) {
if len(subExp) == 0 {
return nil
}
if keyword[index-1] != '\\' {
return nil
// 去除开始和结尾的()
if subExp[0] == '(' && subExp[len(subExp)-1] == ')' {
subExp = subExp[1 : len(subExp)-1]
if len(subExp) == 0 {
return
}
}
var runes = []rune(subExp)
for index, r := range runes {
if r == '[' || r == '{' || r == '.' || r == '+' || r == '$' {
if index == 0 {
return nil
return
}
if keyword[index-1] != '\\' {
if r == '{' && (braceZero.MatchString(string(keyword[index:])) || braceZero2.MatchString(string(keyword[index:]))) { // r {0, ...}
if runes[index-1] != '\\' {
if r == '{' && (braceZeroReg.MatchString(subExp[index:])) || braceZeroReg2.MatchString(subExp[index:]) { // r {0, ...}
if len(result) == 0 {
return nil
}
@@ -222,39 +235,40 @@ func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
}
if r == '?' || r == '*' {
if index == 0 {
return nil
}
if len(result) == 0 {
return nil
return
}
if runes[index-1] != '\\' {
if len(result) > 0 {
return result[:len(result)-1]
}
if r == '\\' || r == '\b' {
// TODO 将来更精细的处理 \d, \s, \$等
break
return
}
}
if (r == 'n' || r == 't' || r == 'a' || r == 'f' || r == 'r' || r == 'v' || r == 'x') && index > 0 && runes[index-1] == '\\' {
switch r {
case 'n':
r = '\n'
case 't':
r = '\t'
case 'f':
r = '\f'
case 'r':
r = '\r'
case 'v':
r = '\v'
case 'a':
r = '\a'
case 'x':
return
}
}
if r == '\\' {
continue
}
result = append(result, r)
}
return
}
// indexOfSymbol returns the index of the first unescaped occurrence of symbol
// in runes, or -1 when there is none.
//
// An occurrence counts as escaped only when it is preceded by an odd number of
// consecutive backslashes: in `\(` the parenthesis is escaped, while in `\\(`
// the two backslashes form a literal backslash and the parenthesis is a real
// symbol.
func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int {
	for index, c := range runes {
		if c != symbol {
			continue
		}

		// Count the run of backslashes directly in front of the candidate;
		// only an odd-length run escapes it.
		var backslashes = 0
		for i := index - 1; i >= 0 && runes[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 == 0 {
			return index
		}
	}
	return -1
}
// isPlain reports whether runes can be treated as plain characters, i.e.
// indexOfSymbol finds none of the symbols '|', '(' or ')' in them.
func (this *Regexp) isPlain(runes []rune) bool {
	return this.indexOfSymbol(runes, '|') < 0 &&
		this.indexOfSymbol(runes, '(') < 0 &&
		this.indexOfSymbol(runes, ')') < 0
}

View File

@@ -7,6 +7,7 @@ import (
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/iwind/TeaGo/assert"
"regexp"
"strings"
"testing"
)
@@ -27,12 +28,14 @@ func TestRegexp_MatchString(t *testing.T) {
var r = re.MustCompile("abc")
a.IsTrue(r.MatchString("abc"))
a.IsFalse(r.MatchString("ab"))
a.IsFalse(r.MatchString("ABC"))
}
{
var r = re.MustCompile("(?i)abc|def|ghi")
a.IsTrue(r.MatchString("DEF"))
a.IsFalse(r.MatchString("ab"))
a.IsTrue(r.MatchString("ABC"))
}
}
@@ -46,39 +49,50 @@ func TestRegexp_Sub(t *testing.T) {
}
// TestRegexp_ParseKeywords logs the keywords that ParseKeywords extracts from
// an expression consisting only of escape sequences. It makes no assertions;
// the output is meant for manual inspection.
func TestRegexp_ParseKeywords(t *testing.T) {
	var r = re.MustCompile("")
	t.Log(r.ParseKeywords(`\n\t\n\f\r\v\x123`))
}
func TestRegexp_ParseKeywords2(t *testing.T) {
var a = assert.NewAssertion(t)
var r = re.MustCompile("")
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)def"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)|(?:def)"), []string{}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)|def"), []string{}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)def"), []string{"abcdef"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)|(?:def)"), []string{"abc", "def"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc)"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(abc|def|ghi)"), []string{"abc", "def", "ghi"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("(?i:abc)"), []string{}))
a.IsTrue(testCompareStrings(r.ParseKeywords("\babc"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(" \babc"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("\babc\b"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("\b(abc)"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`\babc`), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(` \babc`), []string{" "}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`\babc\b`), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`\b(abc)`), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc|efg|hij"), []string{"abc", "efg", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg*|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg?|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg+|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg{2,10}|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg{0,10}|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\|efg.+|hij"), []string{"abc", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("A(abc|bcd)"), []string{"abc", "bcd"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg|hij`), []string{"abc|efg", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg*|hij`), []string{"abc|ef", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg?|hij`), []string{"abc|ef", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg+|hij`), []string{"abc|ef", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg{2,10}|hij`), []string{"abc|ef", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg{0,10}|hij`), []string{"abc|ef", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc\|efg.+|hij`), []string{"abc|efg", "hij"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("A(abc|bcd)"), []string{"Aabc", "Abcd"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("^abc"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc$"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\$"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`abc$`), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc\\d"), []string{"abc"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("abc{0,4}"), []string{"ab"}))
a.IsTrue(testCompareStrings(r.ParseKeywords("{0,4}"), []string{}))
a.IsTrue(testCompareStrings(r.ParseKeywords("{1,4}"), []string{}))
a.IsTrue(testCompareStrings(r.ParseKeywords("中文|北京|上海|golang"), []string{"中文", "北京", "上海", "golang"}))
a.IsTrue(testCompareStrings(r.ParseKeywords(`(onmouseover|onmousemove|onmousedown|onmouseup|onerror|onload|onclick|ondblclick)\s*=`), strings.Split("onmouseover|onmousemove|onmousedown|onmouseup|onerror|onload|onclick|ondblclick", "|")))
a.IsTrue(testCompareStrings(r.ParseKeywords(`/\*(!|\x00)`), []string{"/*"}))
}
func TestRegexp_ParseKeywords2(t *testing.T) {
func TestRegexp_ParseKeywords3(t *testing.T) {
var r = re.MustCompile("")
var policy = firewallconfigs.HTTPFirewallTemplate()
@@ -94,14 +108,23 @@ func TestRegexp_ParseKeywords2(t *testing.T) {
}
func BenchmarkRegexp_MatchString(b *testing.B) {
var r = re.MustCompile("(?i)abc|def|ghi")
var r = re.MustCompile("(?i)(onmouseover|onmousemove|onmousedown|onmouseup|onerror|onload|onclick|ondblclick|onkeydown|onkeyup|onkeypress)(\\s|%09|%0A|(\\+|%20))*(=|%3D)")
//b.Log("keywords:", r.Keywords())
for i := 0; i < b.N; i++ {
r.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
}
}
func BenchmarkRegexp_MatchString2(b *testing.B) {
var r = regexp.MustCompile("(?i)abc|def|ghi")
var r = regexp.MustCompile("(?i)(onmouseover|onmousemove|onmousedown|onmouseup|onerror|onload|onclick|ondblclick|onkeydown|onkeyup|onkeypress)(\\s|%09|%0A|(\\+|%20))*(=|%3D)")
for i := 0; i < b.N; i++ {
r.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
}
}
func BenchmarkRegexp_MatchString_CaseSensitive(b *testing.B) {
var r = re.MustCompile("(abc|def|ghi)")
b.Log("keywords:", r.Keywords())
for i := 0; i < b.N; i++ {
r.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
}

View File

@@ -25,6 +25,11 @@ func TestNewRuneTree(t *testing.T) {
a.IsTrue(tree.Lookup("iwind.liu@gmail.com", true))
}
// TestNewRuneTree2 builds a rune tree from a mixed set of keywords and runs a
// single lookup against a browser User-Agent string. The lookup result is not
// checked; the test only exercises the call.
func TestNewRuneTree2(t *testing.T) {
	var keywords = []string{"abc", "abd", "def", "GHI", "中国", "@"}
	var userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

	re.NewRuneTree(keywords).Lookup(userAgent, true)
}
func BenchmarkRuneMap_Lookup(b *testing.B) {
var tree = re.NewRuneTree([]string{"abc", "abd", "def", "ghi", "中国"})
for i := 0; i < b.N; i++ {