EdgeNode/internal/re/regexp.go

// Copyright 2022 Liuxiangchao iwind.liu@gmail.com. All rights reserved.

package re

import (
	"regexp"
	"strings"
)

var (
	// prefixReg matches a leading flag group such as (?x) and captures the
	// flag letters between "(?" and ")".
	prefixReg = regexp.MustCompile(`^\(\?(\w+)\)`)

	// prefixReg2 matches the opening of a leading flag-scoped group such as
	// "(?x:" and captures everything up to and including the colon.
	prefixReg2 = regexp.MustCompile(`^\(\?([\w\s]*:)`)

	// braceZero matches a leading brace repetition that contains only zeros
	// (or nothing) between the braces, e.g. {0}.
	braceZero = regexp.MustCompile(`^{\s*0*\s*}`)

	// braceZero2 matches a leading brace repetition whose lower bound is made
	// of zeros only (or is missing) and is followed by a comma, e.g. {0, x}.
	braceZero2 = regexp.MustCompile(`^{\s*0*\s*,`)
)

// Regexp wraps a compiled standard-library regular expression together with
// an optional keyword prefilter that MatchString and Match consult before
// running the regexp itself.
type Regexp struct {
	// exp is the source text of the expression.
	exp       string
	// rawRegexp is the compiled expression that performs the actual matching.
	rawRegexp *regexp.Regexp

	// isStrict, when true, makes a successful keyword lookup count as a match
	// without running rawRegexp. It is not assigned anywhere in the visible
	// part of this file.
	isStrict          bool
	// isCaseInsensitive is set by init when the expression starts with a flag
	// group such as (?i) that contains "i"; it is forwarded to the keyword lookup.
	isCaseInsensitive bool
	// keywordsMap is built by init from the keywords ParseKeywords extracts;
	// nil means no prefilter is applied.
	keywordsMap       RuneMap
}

// MustCompile compiles exp with regexp.MustCompile (which panics on an
// invalid expression), wraps the result and initializes its keyword prefilter.
func MustCompile(exp string) *Regexp {
	compiled := regexp.MustCompile(exp)
	wrapped := &Regexp{exp: exp, rawRegexp: compiled}
	wrapped.init()
	return wrapped
}

// Compile compiles exp with regexp.Compile and wraps the result via
// NewRegexp. The compile error, if any, is returned unchanged.
func Compile(exp string) (*Regexp, error) {
	rawRegexp, compileErr := regexp.Compile(exp)
	if compileErr == nil {
		return NewRegexp(rawRegexp), nil
	}
	return nil, compileErr
}

// NewRegexp wraps an already compiled expression, taking its source text from
// rawRegexp.String(), and initializes the keyword prefilter.
func NewRegexp(rawRegexp *regexp.Regexp) *Regexp {
	wrapped := new(Regexp)
	wrapped.exp = rawRegexp.String()
	wrapped.rawRegexp = rawRegexp
	wrapped.init()
	return wrapped
}

// init prepares the keyword prefilter for a non-empty expression: it records
// case-insensitivity from a leading flag group, strips that group, and builds
// keywordsMap from whatever keywords ParseKeywords extracts from the rest.
func (this *Regexp) init() {
	if this.exp == "" {
		return
	}

	pattern := strings.TrimSpace(this.exp)

	// Strip the leading (?...) flag group, remembering whether it asks for
	// case-insensitive matching.
	if groups := prefixReg.FindStringSubmatch(pattern); groups != nil {
		flags := groups[1]
		if strings.Contains(flags, "i") {
			this.isCaseInsensitive = true
		}
		pattern = pattern[len(groups[0]):]
	}

	found := this.ParseKeywords(pattern)
	if len(found) == 0 {
		return
	}
	this.keywordsMap = NewRuneTree(found)
}

// MatchString reports whether s matches the expression. When a keyword
// prefilter exists, a failed lookup rejects s immediately, and in strict mode
// a successful lookup accepts it without running the regexp.
func (this *Regexp) MatchString(s string) bool {
	if this.keywordsMap == nil {
		return this.rawRegexp.MatchString(s)
	}

	switch {
	case !this.keywordsMap.Lookup(s, this.isCaseInsensitive):
		return false
	case this.isStrict:
		return true
	default:
		return this.rawRegexp.MatchString(s)
	}
}

// Match is the []byte counterpart of MatchString: the keyword prefilter (if
// any) is consulted on string(s) first, then the compiled regexp decides
// unless strict mode already accepted the input.
func (this *Regexp) Match(s []byte) bool {
	if prefilter := this.keywordsMap; prefilter != nil {
		hit := prefilter.Lookup(string(s), this.isCaseInsensitive)
		if !hit {
			return false
		}
		if this.isStrict {
			return true
		}
	}
	return this.rawRegexp.Match(s)
}

// ParseKeywords extracts literal keywords from exp, one per top-level
// alternative, such that any string matching exp must contain at least one of
// them. It returns nil when exp is empty or when some alternative yields no
// keyword (including an empty alternative, which matches everything).
//
// If exp compiles and has exactly one capture group, the group's content is
// used in place of the whole expression, e.g. A(abc|bcd) or (a|b|c)\s*=.
// That is only done when the group is mandatory: the text before it must hold
// no unescaped '|', '(' or ')', and the text after it must hold none either
// and must not start with '?', '*' or '{' (any brace repetition is
// conservatively treated as possibly allowing zero occurrences).
//
// TODO support nested groups beyond the single-group case, e.g. A(abc|bcd)
// TODO support (?:xxx)
// TODO support (abc)(bcd)(efg)
func (this *Regexp) ParseKeywords(exp string) []string {
	if len(exp) == 0 {
		return nil
	}

	var runes = []rune(exp)

	// (a|b|c)
	if reg, err := regexp.Compile(exp); err == nil && reg.NumSubexp() == 1 {
		var openIndex = this.indexOfSymbol(runes, '(')
		if openIndex >= 0 {
			var inner = runes[openIndex+1:]
			var closeIndex = this.indexOfSymbol(inner, ')')
			if closeIndex > 0 {
				var suffix = inner[closeIndex+1:]
				// A quantifier right after the group may allow zero
				// repetitions, in which case its content is not required.
				var optional = len(suffix) > 0 && (suffix[0] == '?' || suffix[0] == '*' || suffix[0] == '{')
				if !optional && this.isPlain(runes[:openIndex]) && this.isPlain(suffix) {
					runes = inner[:closeIndex]
				}
			}
		}
	}

	var keywords []string
	var segmentStart = 0
	var escaped = false // true when the previous rune was an unescaped backslash
	for index, r := range runes {
		if escaped {
			escaped = false
			continue
		}
		if r == '\\' {
			escaped = true
			continue
		}
		if r != '|' {
			continue
		}

		// Every alternative must contribute a keyword, otherwise the
		// prefilter could reject strings the expression accepts.
		var ks = this.parseKeyword(runes[segmentStart:index])
		if len(ks) == 0 {
			return nil
		}
		keywords = append(keywords, string(ks))
		segmentStart = index + 1
	}

	// last (or only) alternative
	var ks = this.parseKeyword(runes[segmentStart:])
	if len(ks) == 0 {
		return nil
	}
	return append(keywords, string(ks))
}

// parseKeyword returns the literal prefix of a single alternative that every
// match of that alternative must contain, or an empty result when no such
// literal can be derived.
//
// A leading \b assertion (optionally preceded by whitespace) and a leading
// '^' are skipped. Scanning stops at the first metacharacter:
//   - '(' or ')' gives up entirely (nil);
//   - '[', '{', '.', '+', '$' end the keyword, except that a '{' starting a
//     zero-minimum repetition also drops the preceding rune;
//   - '?' and '*' end the keyword and drop the preceding rune, since that
//     rune is optional;
//   - a backslash ends the keyword (escapes are not interpreted yet).
//
// Because a backslash always ends scanning, a metacharacter reached by the
// loop is never preceded by one, so no escape check is needed here.
func (this *Regexp) parseKeyword(keyword []rune) (result []rune) {
	if len(keyword) == 0 {
		return nil
	}

	// remove first \b
	for index, r := range keyword {
		// the regexp word-boundary assertion is the two runes '\' and 'b'
		if r == '\\' && index+1 < len(keyword) && keyword[index+1] == 'b' {
			keyword = keyword[index+2:]
			break
		}
		// '\b' in Go is the backspace rune; kept for backward compatibility
		if r == '\b' {
			keyword = keyword[index+1:]
			break
		}
		if r != '\t' && r != '\r' && r != '\n' && r != ' ' {
			break
		}
	}
	if len(keyword) == 0 {
		return nil
	}

	// dropLast removes the rune made optional by a following quantifier. It
	// yields nil instead of panicking when nothing has been collected yet
	// (e.g. "^?" or "^*").
	var dropLast = func() []rune {
		if len(result) == 0 {
			return nil
		}
		return result[:len(result)-1]
	}

	for index, r := range keyword {
		if index == 0 && r == '^' {
			continue
		}

		switch r {
		case '(', ')':
			return nil
		case '[', '{', '.', '+', '$':
			if index == 0 {
				return nil
			}
			if r == '{' {
				var rest = string(keyword[index:])
				if braceZero.MatchString(rest) || braceZero2.MatchString(rest) { // r {0, ...}
					return dropLast()
				}
			}
			return result
		case '?', '*':
			if index == 0 {
				return nil
			}
			return dropLast()
		case '\\', '\b':
			// TODO handle \d, \s, \$ and similar escapes more precisely
			return result
		}

		result = append(result, r)
	}
	return result
}

// indexOfSymbol returns the index of the first occurrence of symbol in runes
// that is not escaped by a backslash, or -1 when there is none. Escaping is
// tracked pairwise, so in `\\(` the backslash is escaped and the parenthesis
// is not.
func (this *Regexp) indexOfSymbol(runes []rune, symbol rune) int {
	var escaped = false // true when the previous rune was an unescaped backslash
	for index, c := range runes {
		if escaped {
			escaped = false
			continue
		}
		if c == symbol {
			return index
		}
		if c == '\\' {
			escaped = true
		}
	}
	return -1
}

// isPlain reports whether runes can be treated as plain text for keyword
// extraction, i.e. indexOfSymbol finds no '|', '(' or ')' in it.
func (this *Regexp) isPlain(runes []rune) bool {
	return this.indexOfSymbol(runes, '|') < 0 &&
		this.indexOfSymbol(runes, '(') < 0 &&
		this.indexOfSymbol(runes, ')') < 0
}