Files
EdgeNode/internal/re/regexp.go

292 lines
5.6 KiB
Go
Raw Normal View History

// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re
import (
2022-04-04 18:25:54 +08:00
"github.com/iwind/TeaGo/types"
"regexp"
2022-02-24 16:44:28 +08:00
"regexp/syntax"
"strings"
2022-04-04 18:25:54 +08:00
"sync/atomic"
)
// Patterns used while extracting keywords from an expression.
var (
	// prefixReg matches a leading flag group such as (?i) or (?is);
	// submatch 1 holds the flag letters.
	prefixReg = regexp.MustCompile(`^\(\?([\w\s]+)\)`)

	// braceZeroReg matches a leading zero-count repetition such as {0}.
	braceZeroReg = regexp.MustCompile(`^{\s*0*\s*}`)

	// braceZeroReg2 matches the opening of a repetition whose lower bound
	// is zero, such as {0, x}.
	braceZeroReg2 = regexp.MustCompile(`^{\s*0*\s*,`)
)

// lastId holds the most recently assigned Regexp id; it is only advanced
// through atomic operations.
var lastId uint64
type Regexp struct {
exp string
rawRegexp *regexp.Regexp
isStrict bool
isCaseInsensitive bool
keywords []string
keywordsMap RuneMap
2022-04-04 18:25:54 +08:00
id uint64
idString string
}
// MustCompile compiles exp and wraps the result in a Regexp.
// It panics if exp is not a valid expression, like regexp.MustCompile.
func MustCompile(exp string) *Regexp {
	// regexp.Regexp.String returns the source text it was compiled from,
	// so NewRegexp records the same exp that was passed in here.
	return NewRegexp(regexp.MustCompile(exp))
}
// Compile compiles exp and wraps the result in a Regexp.
// It returns the compilation error unchanged when exp is invalid.
func Compile(exp string) (*Regexp, error) {
	compiled, compileErr := regexp.Compile(exp)
	if compileErr != nil {
		return nil, compileErr
	}

	return NewRegexp(compiled), nil
}
// NewRegexp wraps an already compiled expression and prepares its keywords.
func NewRegexp(rawRegexp *regexp.Regexp) *Regexp {
	var wrapped = new(Regexp)
	wrapped.exp = rawRegexp.String()
	wrapped.rawRegexp = rawRegexp
	wrapped.init()
	return wrapped
}
func (this *Regexp) init() {
2022-04-04 18:25:54 +08:00
this.id = atomic.AddUint64(&lastId, 1)
this.idString = "re:" + types.String(this.id)
if len(this.exp) == 0 {
return
}
var exp = strings.TrimSpace(this.exp)
// 去掉前面的(?...)
if prefixReg.MatchString(exp) {
var matches = prefixReg.FindStringSubmatch(exp)
var modifiers = matches[1]
if strings.Contains(modifiers, "i") {
this.isCaseInsensitive = true
}
exp = exp[len(matches[0]):]
}
var keywords = this.ParseKeywords(exp)
2022-02-24 16:44:28 +08:00
var filteredKeywords = []string{}
var minLength = 1
var isValid = true
for _, keyword := range keywords {
if len(keyword) <= minLength {
isValid = false
break
}
}
if isValid {
filteredKeywords = keywords
}
this.keywords = filteredKeywords
if len(filteredKeywords) > 0 {
this.keywordsMap = NewRuneTree(filteredKeywords)
}
}
// Keywords returns the keywords kept for this expression.
// The returned slice is the internal one, not a copy.
func (this *Regexp) Keywords() []string {
	return this.keywords
}
// Raw returns the underlying compiled standard-library expression.
func (this *Regexp) Raw() *regexp.Regexp {
	return this.rawRegexp
}
// IsCaseInsensitive reports whether the expression started with a flag
// group containing "i".
func (this *Regexp) IsCaseInsensitive() bool {
	return this.isCaseInsensitive
}
func (this *Regexp) MatchString(s string) bool {
if this.keywordsMap != nil {
var b = this.keywordsMap.Lookup(s, this.isCaseInsensitive)
if !b {
return false
}
if this.isStrict {
return true
}
}
2022-02-24 16:44:28 +08:00
return this.rawRegexp.MatchString(s)
}
// Match reports whether the byte slice s matches the expression.
//
// It applies the same keyword pre-check as MatchString; s is converted to a
// string for the keyword lookup.
func (this *Regexp) Match(s []byte) bool {
	if lookup := this.keywordsMap; lookup != nil {
		var found = lookup.Lookup(string(s), this.isCaseInsensitive)
		switch {
		case !found:
			return false
		case this.isStrict:
			return true
		}
	}

	return this.rawRegexp.Match(s)
}
2022-07-16 14:48:57 +08:00
// FindStringSubmatch forwards to the underlying compiled expression.
// It does not use the keyword pre-check.
func (this *Regexp) FindStringSubmatch(s string) []string {
	return this.rawRegexp.FindStringSubmatch(s)
}
// ParseKeywords 提取表达式中的关键词
2022-02-24 16:44:28 +08:00
func (this *Regexp) ParseKeywords(exp string) (keywords []string) {
if len(exp) == 0 {
return nil
}
2022-02-24 16:44:28 +08:00
reg, err := syntax.Parse(exp, syntax.Perl)
if err != nil {
return nil
}
2022-02-24 16:44:28 +08:00
if len(reg.Sub) == 0 {
var keywordRunes = this.parseKeyword(reg.String())
if len(keywordRunes) > 0 {
keywords = append(keywords, string(keywordRunes))
}
return
}
if len(reg.Sub) == 1 {
if reg.Op == syntax.OpStar || reg.Op == syntax.OpQuest || reg.Op == syntax.OpRepeat {
return nil
}
2022-02-24 16:44:28 +08:00
return this.ParseKeywords(reg.Sub[0].String())
}
2022-02-24 16:44:28 +08:00
switch reg.Op {
case syntax.OpConcat:
var prevKeywords = []string{}
var isStarted bool
for _, sub := range reg.Sub {
if sub.String() == `\b` {
if isStarted {
break
}
continue
}
if sub.Op != syntax.OpLiteral && sub.Op != syntax.OpCapture && sub.Op != syntax.OpAlternate {
if isStarted {
break
}
continue
}
var subKeywords = this.ParseKeywords(sub.String())
if len(subKeywords) > 0 {
if !isStarted {
prevKeywords = subKeywords
isStarted = true
} else {
2022-02-24 16:44:28 +08:00
for _, prevKeyword := range prevKeywords {
for _, subKeyword := range subKeywords {
keywords = append(keywords, prevKeyword+subKeyword)
}
}
prevKeywords = keywords
}
2022-02-24 16:44:28 +08:00
} else {
break
}
}
2022-02-24 16:44:28 +08:00
if len(prevKeywords) > 0 && len(keywords) == 0 {
keywords = prevKeywords
}
2022-02-24 16:44:28 +08:00
case syntax.OpAlternate:
for _, sub := range reg.Sub {
var subKeywords = this.ParseKeywords(sub.String())
if len(subKeywords) == 0 {
keywords = nil
return
}
keywords = append(keywords, subKeywords...)
}
}
2022-02-24 16:44:28 +08:00
return
}
2022-04-04 18:25:54 +08:00
// IdString returns the identifier built in init: "re:" followed by the
// numeric id.
func (this *Regexp) IdString() string {
	return this.idString
}
2022-02-24 16:44:28 +08:00
func (this *Regexp) parseKeyword(subExp string) (result []rune) {
if len(subExp) == 0 {
return nil
}
2022-02-24 16:44:28 +08:00
// 去除开始和结尾的()
if subExp[0] == '(' && subExp[len(subExp)-1] == ')' {
subExp = subExp[1 : len(subExp)-1]
if len(subExp) == 0 {
return
}
}
2022-02-24 16:44:28 +08:00
var runes = []rune(subExp)
for index, r := range runes {
if r == '[' || r == '{' || r == '.' || r == '+' || r == '$' {
if index == 0 {
2022-02-24 16:44:28 +08:00
return
}
2022-02-24 16:44:28 +08:00
if runes[index-1] != '\\' {
if r == '{' && (braceZeroReg.MatchString(subExp[index:])) || braceZeroReg2.MatchString(subExp[index:]) { // r {0, ...}
if len(result) == 0 {
return nil
}
return result[:len(result)-1]
}
return
}
}
if r == '?' || r == '*' {
if index == 0 {
2022-02-24 16:44:28 +08:00
return
}
2022-02-24 16:44:28 +08:00
if runes[index-1] != '\\' {
if len(result) > 0 {
return result[:len(result)-1]
}
return
}
}
2022-02-24 16:44:28 +08:00
if (r == 'n' || r == 't' || r == 'a' || r == 'f' || r == 'r' || r == 'v' || r == 'x') && index > 0 && runes[index-1] == '\\' {
switch r {
case 'n':
r = '\n'
case 't':
r = '\t'
case 'f':
r = '\f'
case 'r':
r = '\r'
case 'v':
r = '\v'
case 'a':
r = '\a'
case 'x':
return
}
}
2022-02-24 16:44:28 +08:00
if r == '\\' {
continue
}
2022-02-24 16:44:28 +08:00
result = append(result, r)
}
2022-02-24 16:44:28 +08:00
return
}