Files
EdgeNode/internal/re/regexp.go

241 lines
4.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.
package re
import (
"regexp"
"strings"
)
var prefixReg = regexp.MustCompile(`^\(\?(\w+)\)`) // (?x)
var prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`) // (?x: ...
var braceZero = regexp.MustCompile(`^{\s*0*\s*}`) // {0}
var braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`) // {0, x}
type Regexp struct {
exp string
rawRegexp *regexp.Regexp
isStrict bool
isCaseInsensitive bool
keywordsMap RuneMap
}
func MustCompile(exp string) *Regexp {
var reg = &Regexp{
exp: exp,
rawRegexp: regexp.MustCompile(exp),
}
reg.init()
return reg
}
func Compile(exp string) (*Regexp, error) {
reg, err := regexp.Compile(exp)
if err != nil {
return nil, err
}
return NewRegexp(reg), nil
}
func NewRegexp(rawRegexp *regexp.Regexp) *Regexp {
var reg = &Regexp{
exp: rawRegexp.String(),
rawRegexp: rawRegexp,
}
reg.init()
return reg
}
func (this *Regexp) init() {
if len(this.exp) == 0 {
return
}
//var keywords = []string{}
var exp = strings.TrimSpace(this.exp)
// 去掉前面的(?...)
if prefixReg.MatchString(exp) {
var matches = prefixReg.FindStringSubmatch(exp)
var modifiers = matches[1]
if strings.Contains(modifiers, "i") {
this.isCaseInsensitive = true
}
exp = exp[len(matches[0]):]
}
var keywords = this.ParseKeywords(exp)
if len(keywords) > 0 {
this.keywordsMap = NewRuneTree(keywords)
}
}
func (this *Regexp) MatchString(s string) bool {
if this.keywordsMap != nil {
var b = this.keywordsMap.Lookup(s, this.isCaseInsensitive)
if !b {
return false
}
if this.isStrict {
return true
}
}
return this.rawRegexp.MatchString(s)
}
func (this *Regexp) Match(s []byte) bool {
if this.keywordsMap != nil {
var b = this.keywordsMap.Lookup(string(s), this.isCaseInsensitive)
if !b {
return false
}
if this.isStrict {
return true
}
}
return this.rawRegexp.Match(s)
}
// ParseKeywords 提取表达式中的关键词
// TODO 支持嵌套,类似于 A(abc|bcd)
// TODO 支持 (?:xxx)
// TODO 支持 abc)(bcd)(efg)
func (this *Regexp) ParseKeywords(exp string) []string {
var keywords = []string{}
if len(exp) == 0 {
return nil
}
var runes = []rune(exp)
// (a|b|c)
reg, err := regexp.Compile(exp)
if err == nil {
var countSub = reg.NumSubexp()
if countSub == 1 {
beginIndex := this.indexOfSymbol(runes, '(')
if beginIndex >= 0 {
runes = runes[beginIndex+1:]
symbolIndex := this.indexOfSymbol(runes, ')')
if symbolIndex > 0 && this.isPlain(runes[symbolIndex+1:]) {
runes = runes[:symbolIndex]
if len(runes) == 0 {
return nil
}
}
}
}
}
var lastIndex = 0
for index, r := range runes {
if r == '|' {
if index > 0 && runes[index-1] != '\\' {
var ks = this.parseKeyword(runes[lastIndex:index])
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
}
lastIndex = index + 1
}
}
}
if lastIndex == 0 {
var ks = this.parseKeyword(runes)
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
}
} else if lastIndex > 0 {
var ks = this.parseKeyword(runes[lastIndex:])
if len(ks) > 0 {
keywords = append(keywords, string(ks))
} else {
return nil
}
}
return keywords
}
func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
if len(keyword) == 0 {
return
}
// remove first \b
for index, r := range keyword {
if r == '\b' {
keyword = keyword[index+1:]
break
} else if r != '\t' && r != '\r' && r != '\n' && r != ' ' {
break
}
}
if len(keyword) == 0 {
return
}
for index, r := range keyword {
if index == 0 && r == '^' {
continue
}
if r == '(' || r == ')' {
if index == 0 {
return nil
}
if keyword[index-1] != '\\' {
return nil
}
}
if r == '[' || r == '{' || r == '.' || r == '+' || r == '$' {
if index == 0 {
return nil
}
if keyword[index-1] != '\\' {
if r == '{' && (braceZero.MatchString(string(keyword[index:])) || braceZero2.MatchString(string(keyword[index:]))) { // r {0, ...}
return result[:len(result)-1]
}
return
}
}
if r == '?' || r == '*' {
if index == 0 {
return nil
}
return result[:len(result)-1]
}
if r == '\\' || r == '\b' {
// TODO 将来更精细的处理 \d, \s, \$等
break
}
result = append(result, r)
}
return
}
// 查找符号位置
func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int {
for index, c := range runes {
if c == symbol && (index == 0 || runes[index-1] != '\\') {
return index
}
}
return -1
}
// 是否可视为为普通字符
func (this *Regexp) isPlain(runes []rune) bool {
for _, r := range []rune{'|', '(', ')'} {
if this.indexOfSymbol(runes, r) >= 0 {
return false
}
}
return true
}