提升单词匹配性能

This commit is contained in:
刘祥超
2023-12-09 10:06:07 +08:00
parent e8638e4bec
commit 536efeeb9c
2 changed files with 81 additions and 8 deletions

View File

@@ -9,11 +9,31 @@ func ContainsAnyWord(s string, words []string, isCaseInsensitive bool) bool {
return false return false
} }
var lastRune rune // last searching rune in s
var lastIndex = -2 // -2: not started, -1: not found, >=0: rune index
for _, word := range words { for _, word := range words {
if ContainsWordRunes(allRunes, []rune(word), isCaseInsensitive) { var wordRunes = []rune(word)
if len(wordRunes) == 0 {
continue
}
if lastIndex > -2 && lastRune == wordRunes[0] {
if lastIndex >= 0 {
result, _ := ContainsWordRunes(allRunes[lastIndex:], wordRunes, isCaseInsensitive)
if result {
return true return true
} }
} }
continue
} else {
result, firstIndex := ContainsWordRunes(allRunes, wordRunes, isCaseInsensitive)
lastIndex = firstIndex
if result {
return true
}
}
lastRune = wordRunes[0]
}
return false return false
} }
@@ -25,7 +45,7 @@ func ContainsAllWords(s string, words []string, isCaseInsensitive bool) bool {
} }
for _, word := range words { for _, word := range words {
if !ContainsWordRunes(allRunes, []rune(word), isCaseInsensitive) { if result, _ := ContainsWordRunes(allRunes, []rune(word), isCaseInsensitive); !result {
return false return false
} }
} }
@@ -33,16 +53,22 @@ func ContainsAllWords(s string, words []string, isCaseInsensitive bool) bool {
} }
// ContainsWordRunes 检查字符列表是否包含某个单词子字符列表 // ContainsWordRunes 检查字符列表是否包含某个单词子字符列表
func ContainsWordRunes(allRunes []rune, subRunes []rune, isCaseInsensitive bool) bool { func ContainsWordRunes(allRunes []rune, subRunes []rune, isCaseInsensitive bool) (result bool, firstIndex int) {
firstIndex = -1
var l = len(subRunes) var l = len(subRunes)
if l == 0 { if l == 0 {
return false return false, 0
} }
var al = len(allRunes) var al = len(allRunes)
for index, r := range allRunes { for index, r := range allRunes {
if EqualRune(r, subRunes[0], isCaseInsensitive) && (index == 0 || !isChar(allRunes[index-1]) /**boundary check **/) { if EqualRune(r, subRunes[0], isCaseInsensitive) && (index == 0 || !isChar(allRunes[index-1]) /**boundary check **/) {
if firstIndex < 0 {
firstIndex = index
}
var found = true var found = true
if l > 1 { if l > 1 {
for i := 1; i < l; i++ { for i := 1; i < l; i++ {
@@ -56,12 +82,12 @@ func ContainsWordRunes(allRunes []rune, subRunes []rune, isCaseInsensitive bool)
// check after charset // check after charset
if found && (al <= index+l || !isChar(allRunes[index+l]) /**boundary check **/) { if found && (al <= index+l || !isChar(allRunes[index+l]) /**boundary check **/) {
return true return true, firstIndex
} }
} }
} }
return false return false, firstIndex
} }
// ContainsSubRunes 检查字符列表是否包含某个子子字符列表 // ContainsSubRunes 检查字符列表是否包含某个子子字符列表

View File

@@ -3,9 +3,13 @@
package runes_test package runes_test
import ( import (
"github.com/TeaOSLab/EdgeNode/internal/re"
"github.com/TeaOSLab/EdgeNode/internal/utils/runes" "github.com/TeaOSLab/EdgeNode/internal/utils/runes"
"github.com/iwind/TeaGo/assert" "github.com/iwind/TeaGo/assert"
"regexp"
"runtime" "runtime"
"sort"
"strings"
"testing" "testing"
) )
@@ -25,6 +29,11 @@ func TestContainsAnyWord(t *testing.T) {
a.IsTrue(runes.ContainsAnyWord("How are you?", []string{"how", "ok"}, true)) a.IsTrue(runes.ContainsAnyWord("How are you?", []string{"how", "ok"}, true))
} }
// TestContainsAnyWord_Sort checks a case-sensitive lookup over a word list in
// which several neighbouring entries begin with the same rune ("abc", "ant",
// "arm" and "Hit", "Hi"), while the only entry present in the subject, "are",
// comes last and follows a word with a different first rune.
func TestContainsAnyWord_Sort(t *testing.T) {
	const subject = "How are you?"
	candidates := []string{"abc", "ant", "arm", "Hit", "Hi", "Pet", "pie", "are"}

	a := assert.NewAssertion(t)
	a.IsTrue(runes.ContainsAnyWord(subject, candidates, false))
}
func TestContainsWordRunes(t *testing.T) { func TestContainsWordRunes(t *testing.T) {
var a = assert.NewAssertion(t) var a = assert.NewAssertion(t)
a.IsFalse(runes.ContainsWordRunes([]rune(""), []rune("How"), true)) a.IsFalse(runes.ContainsWordRunes([]rune(""), []rune("How"), true))
@@ -81,7 +90,45 @@ func BenchmarkContainsWordRunes(b *testing.B) {
b.RunParallel(func(pb *testing.PB) { b.RunParallel(func(pb *testing.PB) {
for pb.Next() { for pb.Next() {
_ = runes.ContainsWordRunes([]rune("How are you"), []rune("YOU"), true) _, _ = runes.ContainsWordRunes([]rune("How are you"), []rune("YOU"), true)
}
})
}
// BenchmarkContainsAnyWord measures a case-insensitive runes.ContainsAnyWord
// call against a browser User-Agent string, using a sorted list of client and
// tool names. None of the names occurs in the subject, so every iteration
// walks the whole word list.
func BenchmarkContainsAnyWord(b *testing.B) {
	// Run with 4 procs, and put the previous value back on exit: GOMAXPROCS
	// is process-wide and would otherwise leak into later tests/benchmarks.
	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))

	var words = strings.Split("python\npycurl\nhttp-client\nhttpclient\napachebench\nnethttp\nhttp_request\njava\nperl\nruby\nscrapy\nphp\nrust", "\n")
	// Sorting puts words that start with the same rune next to each other.
	sort.Strings(words)

	// Exclude the setup above from the measurement.
	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_ = runes.ContainsAnyWord("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_0_0) AppleWebKit/500.00 (KHTML, like Gecko) Chrome/100.0.0.0", words, true)
		}
	})
}
// BenchmarkContainsAnyWord_Regexp measures the standard library regexp
// matching the same word list as a single case-insensitive alternation
// against the same User-Agent string, as a baseline for
// BenchmarkContainsAnyWord.
//
// NOTE(review): the pattern has no word-boundary anchors, so it matches the
// words as substrings, whereas ContainsWordRunes checks the characters before
// and after a match. Confirm this difference is acceptable for the comparison.
func BenchmarkContainsAnyWord_Regexp(b *testing.B) {
	// Run with 4 procs, and put the previous value back on exit: GOMAXPROCS
	// is process-wide and would otherwise leak into later tests/benchmarks.
	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))

	// Compile once, outside the timed region.
	var reg = regexp.MustCompile("(?i)" + strings.ReplaceAll("python\npycurl\nhttp-client\nhttpclient\napachebench\nnethttp\nhttp_request\njava\nperl\nruby\nscrapy\nphp\nrust", "\n", "|"))

	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_ = reg.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_0_0) AppleWebKit/500.00 (KHTML, like Gecko) Chrome/100.0.0.0")
		}
	})
}
func BenchmarkContainsAnyWord_Re(b *testing.B) {
runtime.GOMAXPROCS(4)
var reg = re.MustCompile("(?i)" + strings.ReplaceAll("python\npycurl\nhttp-client\nhttpclient\napachebench\nnethttp\nhttp_request\njava\nperl\nruby\nscrapy\nphp\nrust", "\n", "|"))
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_ = reg.MatchString("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_0_0) AppleWebKit/500.00 (KHTML, like Gecko) Chrome/100.0.0.0")
} }
}) })
} }