Files
EdgeNode/internal/re/regexp.go

241 lines
4.8 KiB
Go
Raw Normal View History

// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re
import (
"regexp"
"strings"
)
// Helper patterns used while extracting keywords from an expression.
var (
	// prefixReg matches a leading flag group such as "(?i)" and captures the flag letters.
	prefixReg = regexp.MustCompile(`^\(\?(\w+)\)`)

	// prefixReg2 matches the opening of a leading scoped group such as "(?i:" or "(?:".
	prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`)

	// braceZero matches a leading "{0}"-style brace (zeros optional, whitespace tolerated).
	braceZero = regexp.MustCompile(`^{\s*0*\s*}`)

	// braceZero2 matches the start of a leading "{0, x}"-style brace.
	braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`)
)
// Regexp wraps a compiled regular expression together with literal keywords
// extracted from its source, so that Match/MatchString can consult the
// keyword lookup before running the regular expression itself.
type Regexp struct {
// exp is the source text of the expression.
exp string
// rawRegexp is the compiled standard-library expression used for the actual match.
rawRegexp *regexp.Regexp
// isStrict, when true, makes a successful keyword lookup count as a match
// without running rawRegexp. NOTE(review): nothing in the visible code sets it.
isStrict bool
// isCaseInsensitive is set by init when the leading flag group contains "i";
// it is passed through to the keyword lookup.
isCaseInsensitive bool
// keywordsMap holds the keywords found by ParseKeywords; it stays nil when
// no keywords could be extracted, in which case only rawRegexp is used.
keywordsMap RuneMap
}
// MustCompile compiles exp and wraps it in a Regexp.
// It panics, like regexp.MustCompile, when exp is not a valid expression.
func MustCompile(exp string) *Regexp {
	// NewRegexp takes the source text from the compiled expression, which is exp itself.
	return NewRegexp(regexp.MustCompile(exp))
}
// Compile compiles exp and wraps it in a Regexp.
// It returns the compile error unchanged when exp is not a valid expression.
func Compile(exp string) (*Regexp, error) {
	rawRegexp, compileErr := regexp.Compile(exp)
	if compileErr != nil {
		return nil, compileErr
	}
	return NewRegexp(rawRegexp), nil
}
// NewRegexp wraps an already compiled expression and prepares its keyword lookup.
func NewRegexp(rawRegexp *regexp.Regexp) *Regexp {
	var wrapped = new(Regexp)
	wrapped.exp = rawRegexp.String()
	wrapped.rawRegexp = rawRegexp
	wrapped.init()
	return wrapped
}
// init prepares the keyword lookup for the expression: it strips a leading
// flag group such as "(?i)", records whether the "i" flag was present, and
// builds keywordsMap from whatever ParseKeywords can extract.
func (this *Regexp) init() {
	if this.exp == "" {
		return
	}

	var source = strings.TrimSpace(this.exp)

	// Strip the leading (?...) flag group, remembering case-insensitivity.
	if found := prefixReg.FindStringSubmatch(source); found != nil {
		if strings.Contains(found[1], "i") {
			this.isCaseInsensitive = true
		}
		source = source[len(found[0]):]
	}

	if words := this.ParseKeywords(source); len(words) > 0 {
		this.keywordsMap = NewRuneTree(words)
	}
}
// MatchString reports whether s matches the expression.
// When keywords were extracted, the keyword lookup runs first: a failed
// lookup rejects s without running the regular expression, and in strict
// mode a successful lookup is accepted as the final answer.
func (this *Regexp) MatchString(s string) bool {
	if this.keywordsMap == nil {
		return this.rawRegexp.MatchString(s)
	}
	if !this.keywordsMap.Lookup(s, this.isCaseInsensitive) {
		return false
	}
	return this.isStrict || this.rawRegexp.MatchString(s)
}
// Match reports whether the byte slice s matches the expression.
// It follows the same steps as MatchString; the keyword lookup receives s
// converted to a string.
func (this *Regexp) Match(s []byte) bool {
	if this.keywordsMap == nil {
		return this.rawRegexp.Match(s)
	}
	if !this.keywordsMap.Lookup(string(s), this.isCaseInsensitive) {
		return false
	}
	return this.isStrict || this.rawRegexp.Match(s)
}
// ParseKeywords extracts literal keywords from the expression.
//
// The result is meant to be used as a pre-filter: every string matched by exp
// has to contain at least one of the returned keywords. Whenever that cannot
// be guaranteed the method returns nil, which means "no keywords, run the
// regular expression directly".
//
// Supported shapes are a plain alternation (abc|bcd) and a single capturing
// group, optionally surrounded by other text (A(abc|bcd)B).
//
// TODO support nesting, similar to A(abc|bcd)
// TODO support (?:xxx)
// TODO support (abc)(bcd)(efg)
func (this *Regexp) ParseKeywords(exp string) []string {
	if len(exp) == 0 {
		return nil
	}

	var runes = []rune(exp)

	// (a|b|c): unwrap a single capturing group.
	if reg, err := regexp.Compile(exp); err == nil && reg.NumSubexp() == 1 {
		var beginIndex = this.indexOfSymbol(runes, '(')
		if beginIndex >= 0 {
			// The text before the group is dropped below. That is only safe
			// when it holds no alternation: in "x|(a)" the group is just one
			// of two branches and "a" is not required.
			if !this.isPlain(runes[:beginIndex]) {
				return nil
			}

			runes = runes[beginIndex+1:]
			var symbolIndex = this.indexOfSymbol(runes, ')')
			if symbolIndex > 0 {
				var suffix = runes[symbolIndex+1:]
				if this.isPlain(suffix) {
					// A group that may repeat zero times ("(a|b)?", "(a)*",
					// "(a){0,2}") is optional, so nothing inside it is required.
					if len(suffix) > 0 {
						switch suffix[0] {
						case '?', '*':
							return nil
						case '{':
							var quantifier = string(suffix)
							if braceZero.MatchString(quantifier) || braceZero2.MatchString(quantifier) {
								return nil
							}
						}
					}
					runes = runes[:symbolIndex]
				}
			}
		}
	}

	// Split on unescaped '|' and extract one keyword per alternative.
	var keywords []string
	var lastIndex = 0
	for index, r := range runes {
		if r != '|' {
			continue
		}

		// The pipe is escaped only when preceded by an odd number of
		// backslashes; `\\|` is an escaped backslash followed by a real pipe.
		var backslashes = 0
		for i := index - 1; i >= 0 && runes[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 == 1 {
			continue
		}

		// An alternative without a usable keyword (including an empty one,
		// as in "|abc") can match strings containing none of the keywords.
		var ks = this.parseKeyword(runes[lastIndex:index])
		if len(ks) == 0 {
			return nil
		}
		keywords = append(keywords, string(ks))
		lastIndex = index + 1
	}

	// Last (or only) alternative.
	var ks = this.parseKeyword(runes[lastIndex:])
	if len(ks) == 0 {
		return nil
	}
	return append(keywords, string(ks))
}
// parseKeyword returns the literal prefix of one alternative that can serve
// as a keyword, or an empty result when there is none.
//
// Scanning stops at the first regular-expression metacharacter:
//   - a leading '^' is skipped;
//   - '(' or ')' rejects the whole alternative;
//   - '[', '{', '.', '+', '$' end the keyword (and reject it at position 0);
//     a "{0}" / "{0,n}" brace additionally drops the preceding rune, which is
//     then optional;
//   - '?' and '*' make the preceding rune optional, so it is dropped;
//   - a backslash ends the keyword, because escapes are not interpreted yet.
//
// Since a backslash always ends the scan, a rune reached by the loop is never
// preceded by an escape.
func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
	if len(keyword) == 0 {
		return nil
	}

	// Skip leading whitespace that is followed by a backspace rune.
	// NOTE(review): '\b' here is the backspace rune (U+0008), not the
	// two-character regex escape `\b` (word boundary) - confirm the intent.
	for index, r := range keyword {
		if r == '\b' {
			keyword = keyword[index+1:]
			break
		}
		if r != '\t' && r != '\r' && r != '\n' && r != ' ' {
			break
		}
	}
	if len(keyword) == 0 {
		return nil
	}

	// dropLast removes the rune made optional by a following quantifier.
	// With nothing collected yet (e.g. "^*" or "^{0}") there is no keyword;
	// slicing unconditionally would panic on the empty slice.
	var dropLast = func(collected []rune) []rune {
		if len(collected) == 0 {
			return nil
		}
		return collected[:len(collected)-1]
	}

	for index, r := range keyword {
		if index == 0 && r == '^' {
			continue
		}

		switch r {
		case '(', ')':
			return nil
		case '[', '{', '.', '+', '$':
			if index == 0 {
				return nil
			}
			if r == '{' {
				var rest = string(keyword[index:])
				if braceZero.MatchString(rest) || braceZero2.MatchString(rest) { // r{0}, r{0, ...}
					return dropLast(result)
				}
			}
			return result
		case '?', '*':
			if index == 0 {
				return nil
			}
			return dropLast(result)
		case '\\', '\b':
			// TODO handle \d, \s, \$ and similar escapes more precisely in the future
			return result
		}

		result = append(result, r)
	}
	return result
}
// indexOfSymbol returns the position of the first occurrence of symbol in
// runes that is not directly preceded by a backslash, or -1 if there is none.
func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int {
	for pos := 0; pos < len(runes); pos++ {
		if runes[pos] != symbol {
			continue
		}
		if pos > 0 && runes[pos-1] == '\\' {
			continue
		}
		return pos
	}
	return -1
}
// isPlain reports whether runes can be treated as plain text, meaning it
// contains no unescaped '|', '(' or ')'.
func (this *Regexp) isPlain(runes []rune) bool {
	return this.indexOfSymbol(runes, '|') < 0 &&
		this.indexOfSymbol(runes, '(') < 0 &&
		this.indexOfSymbol(runes, ')') < 0
}