gitea/vendor/github.com/dlclark/regexp2/syntax/parser.go
mrsdizzie af7ffaa279
Server-side syntax highlighting for all code (#12047)
* Server-side syntax highlighting for all code

This PR does a few things:

* Remove all traces of highlight.js
* Use chroma library to provide fast syntax highlighting directly on the server
* Provide syntax highlighting for diffs
* Re-style both unified and split diffs views
* Add custom syntax highlighting styling for both regular and arc-green

Fixes #7729
Fixes #10157
Fixes #11825
Fixes #7728
Fixes #3872
Fixes #3682

And perhaps gets closer to #9553

* fix line marker

* fix repo search

* Fix single line select

* properly load settings

* npm uninstall highlight.js

* review suggestion

* code review

* forgot to call function

* fix test

* Apply suggestions from code review

suggestions from @silverwind thanks

Co-authored-by: silverwind <me@silverwind.io>

* code review

* copy/paste error

* Use const for highlight size limit

* Update web_src/less/_repository.less

Co-authored-by: Lauris BH <lauris@nix.lv>

* update size limit to 1MB and other styling tweaks

* fix highlighting for certain diff sections

* fix test

* add worker back as suggested

Co-authored-by: silverwind <me@silverwind.io>
Co-authored-by: Lauris BH <lauris@nix.lv>
2020-07-01 00:34:03 +03:00

2202 lines
49 KiB
Go

package syntax
import (
"fmt"
"math"
"os"
"sort"
"strconv"
"unicode"
)
// RegexOptions is a bit set of flags that control how a pattern is parsed.
type RegexOptions int32
// Option flags, one bit each. The trailing comment on a flag is its option
// letter. Only IgnoreCase is declared with the RegexOptions type; the other
// names are untyped constants that take that type when used in a
// RegexOptions context.
const (
IgnoreCase RegexOptions = 0x0001 // "i"
Multiline = 0x0002 // "m"
ExplicitCapture = 0x0004 // "n"
Compiled = 0x0008 // "c"
Singleline = 0x0010 // "s"
IgnorePatternWhitespace = 0x0020 // "x"
RightToLeft = 0x0040 // "r"
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
RE2 = 0x0200 // RE2 compat mode
)
// optionFromCode maps an inline option letter to its RegexOptions flag.
// ASCII letters are matched case-insensitively; any rune that is not a
// recognized option letter yields 0.
func optionFromCode(ch rune) RegexOptions {
	// Fold ASCII upper case onto lower case so each option needs one case.
	// Only A-Z is folded: non-ASCII runes must never match an option.
	if ch >= 'A' && ch <= 'Z' {
		ch += 'a' - 'A'
	}

	var opt RegexOptions
	switch ch {
	case 'i':
		opt = IgnoreCase
	case 'r':
		opt = RightToLeft
	case 'm':
		opt = Multiline
	case 'n':
		opt = ExplicitCapture
	case 's':
		opt = Singleline
	case 'x':
		opt = IgnorePatternWhitespace
	case 'd':
		opt = Debug
	case 'e':
		opt = ECMAScript
	}
	return opt
}
// An Error describes a failure to parse a regular expression
// and gives the offending expression.
type Error struct {
// Code identifies the failure. Its text is the message itself, or a fmt
// format string when Args is non-empty (see Error).
Code ErrorCode
// Expr is the pattern text that failed to parse.
Expr string
// Args are the values substituted into Code's format verbs.
Args []interface{}
}
// Error implements the error interface. The message is the text of Code,
// formatted with Args when any are present, followed by the offending
// expression in backquotes.
func (e *Error) Error() string {
	detail := e.Code.String()
	if len(e.Args) != 0 {
		// Code doubles as a format string when arguments were supplied.
		detail = fmt.Sprintf(detail, e.Args...)
	}
	return "error parsing regexp: " + detail + " in `" + e.Expr + "`"
}
// An ErrorCode describes a failure to parse a regular expression.
// The string value is the human-readable message. Codes containing a %v
// verb are used as fmt format strings together with Error.Args.
type ErrorCode string
// Only ErrInternalError is declared with the ErrorCode type; the other
// names are untyped string constants that take that type when passed
// where an ErrorCode is expected (for example to parser.getErr).
const (
// internal issue
ErrInternalError ErrorCode = "regexp/syntax: internal error"
// Parser errors
ErrUnterminatedComment = "unterminated comment"
ErrInvalidCharRange = "invalid character class range"
ErrInvalidRepeatSize = "invalid repeat count"
ErrInvalidUTF8 = "invalid UTF-8"
ErrCaptureGroupOutOfRange = "capture group number out of range"
ErrUnexpectedParen = "unexpected )"
ErrMissingParen = "missing closing )"
ErrMissingBrace = "missing closing }"
ErrInvalidRepeatOp = "invalid nested repetition operator"
ErrMissingRepeatArgument = "missing argument to repetition operator"
ErrConditionalExpression = "illegal conditional (?(...)) expression"
ErrTooManyAlternates = "too many | in (?()|)"
ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
ErrCapNumNotZero = "capture number cannot be zero"
ErrUndefinedBackRef = "reference to undefined group number %v"
ErrUndefinedNameRef = "reference to undefined group name %v"
ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
ErrMalformedReference = "(?(%v) ) malformed"
ErrUndefinedReference = "(?(%v) ) reference to undefined group"
ErrIllegalEndEscape = "illegal \\ at end of pattern"
ErrMalformedSlashP = "malformed \\p{X} character escape"
ErrIncompleteSlashP = "incomplete \\p{X} character escape"
ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
ErrMissingControl = "missing control character"
ErrUnrecognizedControl = "unrecognized control character"
ErrTooFewHex = "insufficient hexadecimal digits"
ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
ErrMalformedNameRef = "malformed \\k<...> named back reference"
ErrBadClassInCharRange = "cannot include class \\%v in character range"
ErrUnterminatedBracket = "unterminated [] set"
ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
ErrReversedCharRange = "[x-y] range in reverse order"
)
// String returns the code's message text (or format string) unchanged.
func (e ErrorCode) String() string {
return string(e)
}
// parser holds the state for parsing a single pattern. Parse first runs
// the countCaptures prescan and then builds the tree with scanRegex.
type parser struct {
// Tree-building state. These are maintained by helpers that are not all
// defined near this declaration; reset clears stack, scanGroupOpen reads
// group.t, scanReplacement assigns concatenation, and scanRegex returns
// unit as the finished root.
stack *regexNode
group *regexNode
alternation *regexNode
concatenation *regexNode
unit *regexNode
// patternRaw is the pattern exactly as supplied; it is copied into errors.
patternRaw string
// pattern is patternRaw decoded into runes; scanning slices this.
pattern []rune
// currentPos is zeroed by reset — presumably the scan cursor into pattern
// (TODO confirm against textpos/moveRight).
currentPos int
specialCase *unicode.SpecialCase
// autocap is the next capture number to hand out automatically.
autocap int
// capcount is the number of distinct capture slots noted so far.
capcount int
// captop is one more than the highest slot number noted (or math.MaxInt32
// itself when that slot is used).
captop int
capsize int
// caps maps a capture slot number to the pattern position recorded for it.
caps map[int]int
// capnames maps a group name to a pattern position during the prescan and
// to its slot number once assignNameSlots has run.
capnames map[string]int
// capnumlist is the sorted list of used slot numbers; it is only built
// when the numbering has gaps.
capnumlist []int
// capnamelist lists group names; after assignNameSlots it holds one name
// per slot, in slot order.
capnamelist []string
// options are the options currently in effect while scanning.
options RegexOptions
// optionsStack is truncated by reset — presumably the saved options of
// enclosing groups (see pushOptions/popOptions, not shown here).
optionsStack []RegexOptions
// ignoreNextParen marks that the next plain "(" opens an alternation
// condition and must not be counted as a capture.
ignoreNextParen bool
}
// Overflow guards used while accumulating decimal digits: a value v can
// take another digit d without exceeding math.MaxInt32 only if
// v < maxValueDiv10, or v == maxValueDiv10 and d <= maxValueMod10.
// maxValueDiv10 is typed int; maxValueMod10 is an untyped constant.
const (
maxValueDiv10 int = math.MaxInt32 / 10
maxValueMod10 = math.MaxInt32 % 10
)
// Parse converts a regex string into a parse tree.
//
// The pattern is scanned twice: countCaptures first discovers the capture
// slots and names, then, after the parser is reset, scanRegex builds the
// node tree. The first error from either pass is returned with a nil tree.
// When the Debug option is set the finished tree is dumped to standard
// output.
func Parse(re string, op RegexOptions) (*RegexTree, error) {
	p := parser{options: op, caps: map[int]int{}}
	p.setPattern(re)

	// Pass 1: find every capture group so references can be validated.
	err := p.countCaptures()
	if err != nil {
		return nil, err
	}

	// Pass 2: rewind and build the actual tree.
	p.reset(op)
	var root *regexNode
	if root, err = p.scanRegex(); err != nil {
		return nil, err
	}

	tree := new(RegexTree)
	tree.root = root
	tree.caps = p.caps
	tree.capnumlist = p.capnumlist
	tree.captop = p.captop
	tree.Capnames = p.capnames
	tree.Caplist = p.capnamelist
	tree.options = op

	if op&Debug > 0 {
		os.Stdout.WriteString(tree.Dump())
	}
	return tree, nil
}
// setPattern stores the pattern both as the raw string (used in error
// messages) and decoded into runes (used for scanning), so that positions
// count characters rather than UTF-8 bytes. Invalid UTF-8 bytes decode to
// U+FFFD, one rune per byte, exactly as ranging over the string would.
func (p *parser) setPattern(pattern string) {
	p.patternRaw = pattern
	p.pattern = []rune(pattern)
}
// getErr builds an *Error for code that carries the raw pattern text and
// any arguments for the code's format verbs.
func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
return &Error{Code: code, Expr: p.patternRaw, Args: args}
}
// noteCaptureSlot records that capture slot i is used by the group found at
// pattern position pos. A slot that is already known is left untouched.
// For a new slot, capcount is incremented and captop is raised so that it
// stays one past the highest slot number seen.
func (p *parser) noteCaptureSlot(i, pos int) {
	if _, seen := p.caps[i]; seen {
		return
	}

	// the stored position isn't used by the parser itself
	p.caps[i] = pos
	p.capcount++

	if i < p.captop {
		return
	}
	// i+1 would overflow the 32-bit capture range at MaxInt32, so pin it.
	top := i
	if i != math.MaxInt32 {
		top = i + 1
	}
	p.captop = top
}
// noteCaptureName records a named group found at pattern position pos.
// The name map is created on first use. A name that was already noted
// keeps its first position and is not appended to capnamelist again.
func (p *parser) noteCaptureName(name string, pos int) {
	if p.capnames == nil {
		p.capnames = map[string]int{}
	}
	if _, seen := p.capnames[name]; seen {
		return
	}
	p.capnamelist = append(p.capnamelist, name)
	p.capnames[name] = pos
}
// assignNameSlots finishes the capture prescan. It works in three steps:
//  1. every named group gets the next slot number that is not already
//     taken, and capnames is rewritten from name->position to name->slot;
//  2. if the slot numbering has gaps, capnumlist is built as the sorted
//     list of used slots;
//  3. capnamelist is rebuilt to hold one name per slot in slot order, using
//     the decimal slot number as the name of unnamed slots (which are also
//     added to capnames).
func (p *parser) assignNameSlots() {
if p.capnames != nil {
for _, name := range p.capnamelist {
// skip over slot numbers already claimed
for p.isCaptureSlot(p.autocap) {
p.autocap++
}
// capnames[name] still holds the pattern position at this point
pos := p.capnames[name]
p.capnames[name] = p.autocap
p.noteCaptureSlot(p.autocap, pos)
p.autocap++
}
}
// if the caps array has at least one gap, construct the list of used slots
if p.capcount < p.captop {
p.capnumlist = make([]int, p.capcount)
i := 0
for k := range p.caps {
p.capnumlist[i] = k
i++
}
// map iteration order is random, so sort to get ascending slot order
sort.Ints(p.capnumlist)
}
// merge capsnumlist into capnamelist
if p.capnames != nil || p.capnumlist != nil {
var oldcapnamelist []string
// next is the slot of the next not-yet-merged name, or -1 when none remain
var next int
// k indexes the next not-yet-merged entry of oldcapnamelist
var k int
if p.capnames == nil {
oldcapnamelist = nil
p.capnames = make(map[string]int)
p.capnamelist = []string{}
next = -1
} else {
// capnames is only ever created together with a first capnamelist
// entry (see noteCaptureName), so index 0 exists here
oldcapnamelist = p.capnamelist
p.capnamelist = []string{}
next = p.capnames[oldcapnamelist[0]]
}
for i := 0; i < p.capcount; i++ {
// j is the slot number for the i-th used slot
j := i
if p.capnumlist != nil {
j = p.capnumlist[i]
}
if next == j {
// this slot belongs to a named group
p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
k++
if k == len(oldcapnamelist) {
next = -1
} else {
next = p.capnames[oldcapnamelist[k]]
}
} else {
// unnamed slot: its name is its number
//feature: culture?
str := strconv.Itoa(j)
p.capnamelist = append(p.capnamelist, str)
p.capnames[str] = j
}
}
}
}
// consumeAutocap returns the current automatic capture number and advances
// the counter so the next call returns the following number.
func (p *parser) consumeAutocap() int {
	p.autocap++
	return p.autocap - 1
}
// countCaptures is a prescanner for deducing the slots used for
// captures by doing a partial tokenization of the pattern.
//
// It notes slot 0 first, then walks the whole pattern recording numbered
// groups via noteCaptureSlot and named groups via noteCaptureName, and
// finally calls assignNameSlots. Results returned by scanBackslash,
// scanBlank and scanCharSet are discarded here; the only error this
// function returns comes from scanDecimal on an explicitly numbered group.
func (p *parser) countCaptures() error {
var ch rune
p.noteCaptureSlot(0, 0)
p.autocap = 1
for p.charsRight() > 0 {
// pos is the position of the character about to be consumed; for a
// '(' it is what gets recorded for the group
pos := p.textpos()
ch = p.moveRightGetChar()
switch ch {
case '\\':
// skip the escape so an escaped paren or bracket is not misread
if p.charsRight() > 0 {
p.scanBackslash(true)
}
case '#':
// in x-mode '#' starts a comment: back up and let scanBlank skip it
if p.useOptionX() {
p.moveLeft()
p.scanBlank()
}
case '[':
// skip the character class in scan-only mode
p.scanCharSet(false, true)
case ')':
if !p.emptyOptionsStack() {
p.popOptions()
}
case '(':
if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
// (?# ... ) comment: back up onto the '(' and let scanBlank skip it
p.moveLeft()
p.scanBlank()
} else {
p.pushOptions()
if p.charsRight() > 0 && p.rightChar(0) == '?' {
// we have (?...
p.moveRight(1)
if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
// named group: (?<... or (?'...
p.moveRight(1)
ch = p.rightChar(0)
if ch != '0' && IsWordChar(ch) {
if ch >= '1' && ch <= '9' {
// explicitly numbered group such as (?<2>...)
dec, err := p.scanDecimal()
if err != nil {
return err
}
p.noteCaptureSlot(dec, pos)
} else {
p.noteCaptureName(p.scanCapname(), pos)
}
}
} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
// RE2-compat (?P<)
p.moveRight(2)
ch = p.rightChar(0)
if IsWordChar(ch) {
p.noteCaptureName(p.scanCapname(), pos)
}
} else {
// (?...
// get the options if it's an option construct (?cimsx-cimsx...)
p.scanOptions()
if p.charsRight() > 0 {
if p.rightChar(0) == ')' {
// (?cimsx-cimsx)
p.moveRight(1)
p.popKeepOptions()
} else if p.rightChar(0) == '(' {
// alternation construct: (?(foo)yes|no)
// ignore the next paren so we don't capture the condition
p.ignoreNextParen = true
// break from here so we don't reset ignoreNextParen
continue
}
}
}
} else {
// plain "(": an automatically numbered capture unless explicit
// capture mode is on or this paren is an alternation condition
if !p.useOptionN() && !p.ignoreNextParen {
p.noteCaptureSlot(p.consumeAutocap(), pos)
}
}
}
p.ignoreNextParen = false
}
}
p.assignNameSlots()
return nil
}
// reset rewinds the parser to the start of the pattern so it can be
// scanned again after the capture prescan: the position returns to 0,
// automatic capture numbering restarts at 1, the options stack is emptied
// (keeping its storage), the active options become topopts and the group
// stack is dropped.
func (p *parser) reset(topopts RegexOptions) {
	p.currentPos, p.autocap = 0, 1
	p.ignoreNextParen = false
	// Re-slicing to zero length is a no-op for a nil or already-empty stack.
	p.optionsStack = p.optionsStack[:0]
	p.options = topopts
	p.stack = nil
}
// scanRegex is the main scanning loop. It consumes the whole pattern and
// returns the root of the resulting node tree, or the first parse error.
//
// Each iteration of the outer loop handles one run of ordinary characters
// followed by at most one special construct (set, group open/close,
// alternation bar, escape, anchor, dot or quantifier). Runs of literals go
// to addToConcatenate; a special construct becomes the current unit, which
// is then wrapped by any quantifier that follows it.
func (p *parser) scanRegex() (*regexNode, error) {
	ch := '@' // nonspecial ch, means at beginning
	isQuant := false

	// the whole pattern is implicitly capture group 0
	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))

	for p.charsRight() > 0 {
		wasPrevQuantifier := isQuant
		isQuant = false

		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		startpos := p.textpos()

		// move past all of the normal characters. We'll stop when we hit some kind of control character,
		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
		if p.useOptionX() {
			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				//UGLY: clean up, this is ugly
				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
					break
				}
				p.moveRight(1)
			}
		} else {
			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
					break
				}
				p.moveRight(1)
			}
		}

		endpos := p.textpos()

		// An unterminated (?# comment found here must be reported like
		// everywhere else. Previously this error was dropped and the
		// pattern was silently accepted with the comment swallowing the
		// rest of the input.
		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		if p.charsRight() == 0 {
			ch = '!' // nonspecial, means at end
		} else if ch = p.rightChar(0); isSpecial(ch) {
			isQuant = isQuantifier(ch)
			p.moveRight(1)
		} else {
			ch = ' ' // nonspecial, means at ordinary char
		}

		if startpos < endpos {
			// When a quantifier follows the run, it applies only to the
			// last character, so that character is split off as its own
			// unit.
			cchUnquantified := endpos - startpos
			if isQuant {
				cchUnquantified--
			}
			wasPrevQuantifier = false

			if cchUnquantified > 0 {
				p.addToConcatenate(startpos, cchUnquantified, false)
			}

			if isQuant {
				p.addUnitOne(p.charAt(endpos - 1))
			}
		}

		switch ch {
		case '!':
			goto BreakOuterScan

		case ' ':
			goto ContinueOuterScan

		case '[':
			cc, err := p.scanCharSet(p.useOptionI(), false)
			if err != nil {
				return nil, err
			}
			p.addUnitSet(cc)

		case '(':
			p.pushOptions()

			if grouper, err := p.scanGroupOpen(); err != nil {
				return nil, err
			} else if grouper == nil {
				// option-only group such as (?i): keep the new options
				p.popKeepOptions()
			} else {
				p.pushGroup()
				p.startGroup(grouper)
			}

			continue

		case '|':
			p.addAlternate()
			goto ContinueOuterScan

		case ')':
			if p.emptyStack() {
				return nil, p.getErr(ErrUnexpectedParen)
			}

			if err := p.addGroup(); err != nil {
				return nil, err
			}
			if err := p.popGroup(); err != nil {
				return nil, err
			}
			p.popOptions()

			if p.unit == nil {
				goto ContinueOuterScan
			}

		case '\\':
			n, err := p.scanBackslash(false)
			if err != nil {
				return nil, err
			}
			p.addUnitNode(n)

		case '^':
			if p.useOptionM() {
				p.addUnitType(ntBol)
			} else {
				p.addUnitType(ntBeginning)
			}

		case '$':
			if p.useOptionM() {
				p.addUnitType(ntEol)
			} else {
				p.addUnitType(ntEndZ)
			}

		case '.':
			if p.useOptionE() {
				p.addUnitSet(ECMAAnyClass())
			} else if p.useOptionS() {
				p.addUnitSet(AnyClass())
			} else {
				p.addUnitNotone('\n')
			}

		case '{', '*', '+', '?':
			// a quantifier needs a unit to apply to
			if p.unit == nil {
				if wasPrevQuantifier {
					return nil, p.getErr(ErrInvalidRepeatOp)
				}
				return nil, p.getErr(ErrMissingRepeatArgument)
			}
			// back up so the quantifier handling below re-reads it
			p.moveLeft()

		default:
			return nil, p.getErr(ErrInternalError)
		}

		if err := p.scanBlank(); err != nil {
			return nil, err
		}

		if p.charsRight() > 0 {
			isQuant = p.isTrueQuantifier()
		}
		if p.charsRight() == 0 || !isQuant {
			//maintain odd C# assignment order -- not sure if required, could clean up?
			p.addConcatenate()
			goto ContinueOuterScan
		}

		ch = p.moveRightGetChar()

		// Handle quantifiers
		for p.unit != nil {
			var min, max int
			var lazy bool

			switch ch {
			case '*':
				min = 0
				max = math.MaxInt32

			case '?':
				min = 0
				max = 1

			case '+':
				min = 1
				max = math.MaxInt32

			case '{':
				{
					// {n}, {n,} or {n,m}
					var err error
					startpos = p.textpos()
					if min, err = p.scanDecimal(); err != nil {
						return nil, err
					}
					max = min
					if startpos < p.textpos() {
						if p.charsRight() > 0 && p.rightChar(0) == ',' {
							p.moveRight(1)
							if p.charsRight() == 0 || p.rightChar(0) == '}' {
								max = math.MaxInt32
							} else {
								if max, err = p.scanDecimal(); err != nil {
									return nil, err
								}
							}
						}
					}

					// not a well-formed quantifier after all: rewind onto
					// the '{' so it is rescanned as ordinary text
					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
						p.addConcatenate()
						p.textto(startpos - 1)
						goto ContinueOuterScan
					}
				}

			default:
				return nil, p.getErr(ErrInternalError)
			}

			if err := p.scanBlank(); err != nil {
				return nil, err
			}

			// a trailing '?' makes the quantifier lazy
			if p.charsRight() == 0 || p.rightChar(0) != '?' {
				lazy = false
			} else {
				p.moveRight(1)
				lazy = true
			}

			if min > max {
				return nil, p.getErr(ErrInvalidRepeatSize)
			}

			p.addConcatenate3(lazy, min, max)
		}

	ContinueOuterScan:
	}

BreakOuterScan:
	;

	// anything left on the group stack means a '(' was never closed
	if !p.emptyStack() {
		return nil, p.getErr(ErrMissingParen)
	}

	if err := p.addGroup(); err != nil {
		return nil, err
	}

	return p.unit, nil
}
// scanReplacement does the simple parsing needed for replacement patterns.
// The text is split at '$' characters: each literal run is appended to a
// fresh concatenation node, and each '$' is handed to scanDollar, whose
// node is added as a unit. It returns the concatenation node, or the first
// error reported by scanDollar.
func (p *parser) scanReplacement() (*regexNode, error) {
	p.concatenation = newRegexNode(ntConcatenate, p.options)

	for remaining := p.charsRight(); remaining != 0; remaining = p.charsRight() {
		// Collect the literal run up to the next '$' (or the end).
		litStart := p.textpos()
		for ; remaining > 0 && p.rightChar(0) != '$'; remaining-- {
			p.moveRight(1)
		}
		p.addToConcatenate(litStart, p.textpos()-litStart, true)

		if remaining <= 0 {
			continue
		}

		// We stopped on a '$': let scanDollar decide what it means.
		if p.moveRightGetChar() == '$' {
			node, err := p.scanDollar()
			if err != nil {
				return nil, err
			}
			p.addUnitNode(node)
		}
		p.addConcatenate()
	}

	return p.concatenation, nil
}
// scanDollar scans the $ patterns recognized within replacement patterns.
// It is called with the position just past the '$' and returns:
//   - an ntRef node for a numbered or named group reference ($1, ${1},
//     ${name}) when that group exists;
//   - an ntRef node carrying a special value for $&, $`, $', $+ and $_;
//   - a literal '$' node for "$$", for a '$' at the end of the text, and
//     for anything unrecognized (in which case the position is rewound so
//     the following characters are treated as ordinary text).
// The only errors come from decimal overflow while reading a group number.
func (p *parser) scanDollar() (*regexNode, error) {
if p.charsRight() == 0 {
return newRegexNodeCh(ntOne, p.options, '$'), nil
}
ch := p.rightChar(0)
// angled is true for the braced form ${...}
angled := false
// backpos is where to rewind to if the '$' turns out to be a literal
backpos := p.textpos()
lastEndPos := backpos
// Note angle
if ch == '{' && p.charsRight() > 1 {
angled = true
p.moveRight(1)
ch = p.rightChar(0)
}
// Try to parse backreference: $1 or ${1} or ${cap}
if ch >= '0' && ch <= '9' {
if !angled && p.useOptionE() {
// ECMAScript mode, unbraced: read digits one at a time and remember
// the longest prefix that names an existing capture slot
capnum := -1
newcapnum := int(ch - '0')
p.moveRight(1)
if p.isCaptureSlot(newcapnum) {
capnum = newcapnum
lastEndPos = p.textpos()
}
for p.charsRight() > 0 {
ch = p.rightChar(0)
if ch < '0' || ch > '9' {
break
}
digit := int(ch - '0')
// same overflow guard as scanDecimal
if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
return nil, p.getErr(ErrCaptureGroupOutOfRange)
}
newcapnum = newcapnum*10 + digit
p.moveRight(1)
if p.isCaptureSlot(newcapnum) {
capnum = newcapnum
lastEndPos = p.textpos()
}
}
// give back any digits beyond the longest valid group number
p.textto(lastEndPos)
if capnum >= 0 {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
} else {
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
// the braced form must be closed by '}'
if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
}
}
} else if angled && IsWordChar(ch) {
// ${name}
capname := p.scanCapname()
if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
if p.isCaptureName(capname) {
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
}
}
} else if !angled {
// capnum stays 1 when ch is not one of the special characters below;
// 1 is used as the "nothing recognized" marker
capnum := 1
switch ch {
case '$':
p.moveRight(1)
return newRegexNodeCh(ntOne, p.options, '$'), nil
case '&':
capnum = 0
case '`':
capnum = replaceLeftPortion
case '\'':
capnum = replaceRightPortion
case '+':
capnum = replaceLastGroup
case '_':
capnum = replaceWholeString
}
if capnum != 1 {
p.moveRight(1)
return newRegexNodeM(ntRef, p.options, capnum), nil
}
}
// unrecognized $: literalize
p.textto(backpos)
return newRegexNodeCh(ntOne, p.options, '$'), nil
}
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
// a RegexNode for the type of group scanned, or nil if the group
// simply changed options (?cimsx-cimsx) or was a comment (#...).
//
// Constructs handled after "(?":
//   - ":"                 non-capturing group
//   - "=" and "!"         lookahead; RightToLeft is cleared
//   - ">"                 ntGreedy group
//   - "<=" and "<!"       lookbehind; RightToLeft is set
//   - "<name>", "'name'", "<name1-name2>" and numbered forms: captures
//   - "("                 alternation construct (?(cond)yes|no)
//   - "P<name>"           named capture, only in RE2 mode
//   - anything else       inline options, optionally followed by ':'
//
// Anything that cannot be recognized produces ErrUnrecognizedGrouping with
// the text scanned so far.
func (p *parser) scanGroupOpen() (*regexNode, error) {
var ch rune
var nt nodeType
var err error
// close is the terminator expected after a group name: '>' or '\''
close := '>'
start := p.textpos()
// just return a RegexNode if we have:
// 1. "(" followed by nothing
// 2. "(x" where x != ?
// 3. "(?)"
if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
// no capture in explicit-capture mode or for an alternation condition
if p.useOptionN() || p.ignoreNextParen {
p.ignoreNextParen = false
return newRegexNode(ntGroup, p.options), nil
}
return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
}
// skip the '?'
p.moveRight(1)
// This loop never iterates twice: every path through the body returns or
// jumps to BreakRecognize. It exists so "break" can be used as an exit.
for {
if p.charsRight() == 0 {
break
}
switch ch = p.moveRightGetChar(); ch {
case ':':
nt = ntGroup
case '=':
p.options &= ^RightToLeft
nt = ntRequire
case '!':
p.options &= ^RightToLeft
nt = ntPrevent
case '>':
nt = ntGreedy
case '\'':
// (?'name'...) is handled like (?<name>...) with a different terminator
close = '\''
fallthrough
case '<':
if p.charsRight() == 0 {
goto BreakRecognize
}
switch ch = p.moveRightGetChar(); ch {
case '=':
// lookbehind is only valid with '<', not with '\''
if close == '\'' {
goto BreakRecognize
}
p.options |= RightToLeft
nt = ntRequire
case '!':
if close == '\'' {
goto BreakRecognize
}
p.options |= RightToLeft
nt = ntPrevent
default:
// a capture name or number; put the character back first
p.moveLeft()
// capnum is the slot being captured into, uncapnum the slot named
// after '-'; -1 means "none"
capnum := -1
uncapnum := -1
// proceed is set when the name part is empty, as in (?<-name>...)
proceed := false
// grab part before -
if ch >= '0' && ch <= '9' {
if capnum, err = p.scanDecimal(); err != nil {
return nil, err
}
if !p.isCaptureSlot(capnum) {
capnum = -1
}
// check if we have bogus characters after the number
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
return nil, p.getErr(ErrInvalidGroupName)
}
if capnum == 0 {
return nil, p.getErr(ErrCapNumNotZero)
}
} else if IsWordChar(ch) {
capname := p.scanCapname()
if p.isCaptureName(capname) {
capnum = p.captureSlotFromName(capname)
}
// check if we have bogus character after the name
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
return nil, p.getErr(ErrInvalidGroupName)
}
} else if ch == '-' {
proceed = true
} else {
// bad group name - starts with something other than a word character and isn't a number
return nil, p.getErr(ErrInvalidGroupName)
}
// grab part after - if any
if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
p.moveRight(1)
//no more chars left, no closing char, etc
if p.charsRight() == 0 {
return nil, p.getErr(ErrInvalidGroupName)
}
ch = p.rightChar(0)
if ch >= '0' && ch <= '9' {
if uncapnum, err = p.scanDecimal(); err != nil {
return nil, err
}
if !p.isCaptureSlot(uncapnum) {
return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
}
// check if we have bogus characters after the number
if p.charsRight() > 0 && p.rightChar(0) != close {
return nil, p.getErr(ErrInvalidGroupName)
}
} else if IsWordChar(ch) {
uncapname := p.scanCapname()
if !p.isCaptureName(uncapname) {
return nil, p.getErr(ErrUndefinedNameRef, uncapname)
}
uncapnum = p.captureSlotFromName(uncapname)
// check if we have bogus character after the name
if p.charsRight() > 0 && p.rightChar(0) != close {
return nil, p.getErr(ErrInvalidGroupName)
}
} else {
// bad group name - starts with something other than a word character and isn't a number
return nil, p.getErr(ErrInvalidGroupName)
}
}
// actually make the node
if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
}
goto BreakRecognize
}
case '(':
// alternation construct (?(...) | )
parenPos := p.textpos()
if p.charsRight() > 0 {
ch = p.rightChar(0)
// check if the alternation condition is a backref
if ch >= '0' && ch <= '9' {
var capnum int
if capnum, err = p.scanDecimal(); err != nil {
return nil, err
}
if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntTestref, p.options, capnum), nil
}
return nil, p.getErr(ErrUndefinedReference, capnum)
}
return nil, p.getErr(ErrMalformedReference, capnum)
} else if IsWordChar(ch) {
capname := p.scanCapname()
if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
}
}
}
// not a backref
nt = ntTestgroup
p.textto(parenPos - 1) // jump to the start of the parentheses
p.ignoreNextParen = true // but make sure we don't try to capture the insides
// after the rewind, rightChar(0) is the condition's '(' again, so
// rightChar(1) and rightChar(2) look at what follows it
charsRight := p.charsRight()
if charsRight >= 3 && p.rightChar(1) == '?' {
rightchar2 := p.rightChar(2)
// disallow comments in the condition
if rightchar2 == '#' {
return nil, p.getErr(ErrAlternationCantHaveComment)
}
// disallow named capture group (?<..>..) in the condition
if rightchar2 == '\'' {
return nil, p.getErr(ErrAlternationCantCapture)
}
// "(?<" is only allowed when it starts a lookbehind
if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
return nil, p.getErr(ErrAlternationCantCapture)
}
}
case 'P':
if p.useRE2() {
// support for P<name> syntax
if p.charsRight() < 3 {
goto BreakRecognize
}
ch = p.moveRightGetChar()
if ch != '<' {
goto BreakRecognize
}
// peek at the first character of the name
ch = p.moveRightGetChar()
p.moveLeft()
if IsWordChar(ch) {
capnum := -1
capname := p.scanCapname()
if p.isCaptureName(capname) {
capnum = p.captureSlotFromName(capname)
}
// check if we have bogus character after the name
if p.charsRight() > 0 && p.rightChar(0) != '>' {
return nil, p.getErr(ErrInvalidGroupName)
}
// actually make the node
if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
}
goto BreakRecognize
} else {
// bad group name - starts with something other than a word character and isn't a number
return nil, p.getErr(ErrInvalidGroupName)
}
}
// if we're not using RE2 compat mode then
// we just behave like normal
fallthrough
default:
// inline options: put the character back and let scanOptions read it
p.moveLeft()
nt = ntGroup
// disallow options in the children of a testgroup node
if p.group.t != ntTestgroup {
p.scanOptions()
}
if p.charsRight() == 0 {
goto BreakRecognize
}
// "(?opts)" only changes options and yields no node
if ch = p.moveRightGetChar(); ch == ')' {
return nil, nil
}
// otherwise it must be "(?opts:"
if ch != ':' {
goto BreakRecognize
}
}
return newRegexNode(nt, p.options), nil
}
BreakRecognize:
// break Recognize comes here
return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
}
// scanBackslash scans the construct that follows a backslash and returns
// the node for it. It handles the zero-width assertions (\b \B \A \G \Z
// \z), the shorthand classes (\w \W \s \S \d \D, with ECMAScript variants
// when that option is on) and Unicode properties (\p{..} \P{..}).
// Everything else is delegated to scanBasicBackslash, which is the only
// place scanOnly is used. A backslash at the very end of the pattern is an
// error.
func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
	if p.charsRight() == 0 {
		return nil, p.getErr(ErrIllegalEndEscape)
	}

	ch := p.rightChar(0)
	switch ch {
	case 'b', 'B', 'A', 'G', 'Z', 'z':
		p.moveRight(1)
		return newRegexNode(p.typeFromCode(ch), p.options), nil

	case 'w', 'W', 's', 'S', 'd', 'D':
		p.moveRight(1)
		ecma := p.useOptionE()
		switch {
		case ch == 'w' && ecma:
			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
		case ch == 'w':
			return newRegexNodeSet(ntSet, p.options, WordClass()), nil
		case ch == 'W' && ecma:
			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
		case ch == 'W':
			return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
		case ch == 's' && ecma:
			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
		case ch == 's':
			return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
		case ch == 'S' && ecma:
			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
		case ch == 'S':
			return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
		case ch == 'd' && ecma:
			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
		case ch == 'd':
			return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
		case ecma:
			// only 'D' is left
			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
		default:
			return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
		}

	case 'p', 'P':
		p.moveRight(1)
		prop, err := p.parseProperty()
		if err != nil {
			return nil, err
		}
		// the upper-case form negates the category
		negate := ch == 'P'
		set := &CharSet{}
		set.addCategory(prop, negate, p.useOptionI(), p.patternRaw)
		if p.useOptionI() {
			set.addLowercase()
		}
		return newRegexNodeSet(ntSet, p.options, set), nil
	}

	return p.scanBasicBackslash(scanOnly)
}
// scanBasicBackslash scans \-style backreferences and character escapes.
// It is called with the position just past the backslash.
//
// Recognized reference forms are \k<name>, \k'name', the same without the
// k (\<name>, \'name'), their numeric variants, and plain \1 .. \9...
// If none of them applies, the position is rewound and the text is read as
// a single-character escape via scanCharEscape.
//
// With scanOnly set, a plain numeric reference is consumed and (nil, nil)
// is returned without checking that the group exists.
func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
if p.charsRight() == 0 {
return nil, p.getErr(ErrIllegalEndEscape)
}
// angled is true when the reference is delimited by <> or ''
angled := false
// close is the delimiter that must end a delimited reference
close := '\x00'
// backpos is where to rewind to if this is not a backreference
backpos := p.textpos()
ch := p.rightChar(0)
// allow \k<foo> instead of \<foo>, which is now deprecated
if ch == 'k' {
if p.charsRight() >= 2 {
p.moveRight(1)
ch = p.moveRightGetChar()
if ch == '<' || ch == '\'' {
angled = true
if ch == '\'' {
close = '\''
} else {
close = '>'
}
}
}
// \k must be followed by an opening delimiter and at least one more char
if !angled || p.charsRight() <= 0 {
return nil, p.getErr(ErrMalformedNameRef)
}
ch = p.rightChar(0)
} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
angled = true
if ch == '\'' {
close = '\''
} else {
close = '>'
}
p.moveRight(1)
ch = p.rightChar(0)
}
// Try to parse backreference: \<1> or \<cap>
if angled && ch >= '0' && ch <= '9' {
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
if p.charsRight() > 0 && p.moveRightGetChar() == close {
if p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
return nil, p.getErr(ErrUndefinedBackRef, capnum)
}
} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
capnum, err := p.scanDecimal()
if err != nil {
return nil, err
}
if scanOnly {
return nil, nil
}
// in ECMAScript mode the reference is accepted without an existence check
if p.useOptionE() || p.isCaptureSlot(capnum) {
return newRegexNodeM(ntRef, p.options, capnum), nil
}
// a single digit can only be a backreference; longer numbers fall
// through and are re-read as a character escape below
if capnum <= 9 {
return nil, p.getErr(ErrUndefinedBackRef, capnum)
}
} else if angled && IsWordChar(ch) {
capname := p.scanCapname()
if p.charsRight() > 0 && p.moveRightGetChar() == close {
if p.isCaptureName(capname) {
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
}
return nil, p.getErr(ErrUndefinedNameRef, capname)
}
}
// Not backreference: must be char code
p.textto(backpos)
ch, err := p.scanCharEscape()
if err != nil {
return nil, err
}
if p.useOptionI() {
ch = unicode.ToLower(ch)
}
return newRegexNodeCh(ntOne, p.options, ch), nil
}
// parseProperty scans X for \p{X} or \P{X}, starting just after the p/P,
// and returns the name X. The name may consist of word characters and '-'.
//
// Errors: ErrIncompleteSlashP when fewer than three characters remain or
// the closing '}' is missing, ErrMalformedSlashP when the name is not
// introduced by '{', and ErrUnknownSlashP when isValidUnicodeCat rejects
// the name.
func (p *parser) parseProperty() (string, error) {
	if p.charsRight() < 3 {
		return "", p.getErr(ErrIncompleteSlashP)
	}
	if p.moveRightGetChar() != '{' {
		return "", p.getErr(ErrMalformedSlashP)
	}

	nameStart := p.textpos()
	for p.charsRight() > 0 {
		if c := p.moveRightGetChar(); !IsWordChar(c) && c != '-' {
			// went one too far: the name ended before this character
			p.moveLeft()
			break
		}
	}
	name := string(p.pattern[nameStart:p.textpos()])

	// The name has to be followed immediately by the closing brace.
	if !(p.charsRight() != 0 && p.moveRightGetChar() == '}') {
		return "", p.getErr(ErrIncompleteSlashP)
	}

	if isValidUnicodeCat(name) {
		return name, nil
	}
	return "", p.getErr(ErrUnknownSlashP, name)
}
// typeFromCode returns the node type of the zero-length assertion written
// as a backslash followed by ch. \b and \B map to the ECMAScript boundary
// types when that option is on. Any other character yields ntNothing.
func (p *parser) typeFromCode(ch rune) nodeType {
	switch ch {
	case 'A':
		return ntBeginning
	case 'G':
		return ntStart
	case 'Z':
		return ntEndZ
	case 'z':
		return ntEnd
	case 'b', 'B':
		ecma := p.useOptionE()
		switch {
		case ch == 'b' && ecma:
			return ntECMABoundary
		case ch == 'b':
			return ntBoundary
		case ecma:
			return ntNonECMABoundary
		}
		return ntNonboundary
	}
	return ntNothing
}
// scanBlank skips text that is not part of the expression itself.
//
// With the x option (see useOptionX) it repeatedly skips whitespace,
// '#' comments running to the end of the line, and (?# ... ) comments.
// Without it, only (?# ... ) comments are skipped.
//
// It returns ErrUnterminatedComment when a (?# comment has no closing ')'.
func (p *parser) scanBlank() error {
if p.useOptionX() {
for {
for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
p.moveRight(1)
}
if p.charsRight() == 0 {
break
}
if p.rightChar(0) == '#' {
// line comment: stop on the newline, which the whitespace loop
// above consumes on the next pass
for p.charsRight() > 0 && p.rightChar(0) != '\n' {
p.moveRight(1)
}
} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
p.rightChar(1) == '?' && p.rightChar(0) == '(' {
// (?# ... ) comment: runs to the first ')'
for p.charsRight() > 0 && p.rightChar(0) != ')' {
p.moveRight(1)
}
if p.charsRight() == 0 {
return p.getErr(ErrUnterminatedComment)
}
p.moveRight(1)
} else {
break
}
}
} else {
for {
// anything other than "(?#" ends the scan
if p.charsRight() < 3 || p.rightChar(2) != '#' ||
p.rightChar(1) != '?' || p.rightChar(0) != '(' {
return nil
}
for p.charsRight() > 0 && p.rightChar(0) != ')' {
p.moveRight(1)
}
if p.charsRight() == 0 {
return p.getErr(ErrUnterminatedComment)
}
p.moveRight(1)
}
}
return nil
}
// scanCapname consumes the run of word characters at the current position
// and returns it. The result is empty when the next character is not a
// word character or nothing is left to scan.
func (p *parser) scanCapname() string {
	begin := p.textpos()

	for p.charsRight() > 0 {
		if IsWordChar(p.moveRightGetChar()) {
			continue
		}
		// stepped past the end of the name: put that character back
		p.moveLeft()
		break
	}

	return string(p.pattern[begin:p.textpos()])
}
// scanCharSet scans the contents of [] (not including the opening '['),
// and converts them to a set. On success the closing ']' has been consumed.
//
// caseInsensitive makes the finished set case-insensitive via
// addLowercase. With scanOnly set nothing is built: the text is only
// skipped and the returned set is nil (this is what the capture prescan
// uses).
//
// Supported contents: a leading '^' for negation, single characters and
// escapes, ranges (a-z), the class escapes \d \D \s \S \w \W \p{} \P{},
// POSIX-style [:name:] classes (looked up only in RE2 mode), and a
// trailing subtraction (-[...]).
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
	ch := '\x00'
	chPrev := '\x00'
	inRange := false
	firstChar := true
	closed := false

	var cc *CharSet
	if !scanOnly {
		cc = &CharSet{}
	}

	if p.charsRight() > 0 && p.rightChar(0) == '^' {
		p.moveRight(1)
		if !scanOnly {
			cc.negate = true
		}
	}

	for ; p.charsRight() > 0; firstChar = false {
		// fTranslatedChar marks that ch came from an escape sequence and
		// must therefore not be treated as a syntax character
		fTranslatedChar := false
		ch = p.moveRightGetChar()
		if ch == ']' {
			// a ']' in first position is a literal, except in ECMAScript
			// mode where "[]" is an empty set
			if !firstChar {
				closed = true
				break
			} else if p.useOptionE() {
				if !scanOnly {
					cc.addRanges(NoneClass().ranges)
				}
				closed = true
				break
			}
		} else if ch == '\\' && p.charsRight() > 0 {
			switch ch = p.moveRightGetChar(); ch {
			case 'D', 'd':
				if !scanOnly {
					if inRange {
						// string(ch) so the message shows the class letter
						// rather than its code point number
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
				}
				continue

			case 'S', 's':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addSpace(p.useOptionE(), ch == 'S')
				}
				continue

			case 'W', 'w':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					cc.addWord(p.useOptionE(), ch == 'W')
				}
				continue

			case 'p', 'P':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, string(ch))
					}
					prop, err := p.parseProperty()
					if err != nil {
						return nil, err
					}
					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
				} else {
					// scan-only: just skip the property; the result is
					// deliberately discarded, as before
					p.parseProperty()
				}
				continue

			case '-':
				if !scanOnly {
					cc.addRange(ch, ch)
				}
				continue

			default:
				p.moveLeft()
				var err error
				ch, err = p.scanCharEscape() // non-literal character
				if err != nil {
					return nil, err
				}
				fTranslatedChar = true
				break // this break will only break out of the switch
			}
		} else if ch == '[' {
			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
			// Outside RE2 mode it doesn't do anything other than skip the whole thing!
			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
				savePos := p.textpos()

				p.moveRight(1)
				negate := false
				if p.charsRight() > 1 && p.rightChar(0) == '^' {
					negate = true
					p.moveRight(1)
				}

				nm := p.scanCapname() // snag the name
				if !scanOnly && p.useRE2() {
					// look up the name since these are valid for RE2
					// add the group based on the name
					if ok := cc.addNamedASCII(nm, negate); !ok {
						return nil, p.getErr(ErrInvalidCharRange)
					}
				}
				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
					// not a well-formed [:name:], rewind to just after the '['
					p.textto(savePos)
				} else if p.useRE2() {
					// move on
					continue
				}
			}
		}

		if inRange {
			inRange = false
			if !scanOnly {
				if ch == '[' && !fTranslatedChar && !firstChar {
					// We thought we were in a range, but we're actually starting a subtraction.
					// In that case, we'll add chPrev to our char class, skip the opening [, and
					// scan the new character class recursively.
					cc.addChar(chPrev)
					sub, err := p.scanCharSet(caseInsensitive, false)
					if err != nil {
						return nil, err
					}
					cc.addSubtraction(sub)

					if p.charsRight() > 0 && p.rightChar(0) != ']' {
						return nil, p.getErr(ErrSubtractionMustBeLast)
					}
				} else {
					// a regular range, like a-z
					if chPrev > ch {
						return nil, p.getErr(ErrReversedCharRange)
					}
					cc.addRange(chPrev, ch)
				}
			}
		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
			// this could be the start of a range
			chPrev = ch
			inRange = true
			p.moveRight(1)
		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
			// we aren't in a range, and now there is a subtraction. Usually this happens
			// only when a subtraction follows a range, like [a-z-[b]]
			if !scanOnly {
				p.moveRight(1)
				sub, err := p.scanCharSet(caseInsensitive, false)
				if err != nil {
					return nil, err
				}
				cc.addSubtraction(sub)

				if p.charsRight() > 0 && p.rightChar(0) != ']' {
					return nil, p.getErr(ErrSubtractionMustBeLast)
				}
			} else {
				// scan-only: skip the nested set; its result is
				// deliberately discarded, as before
				p.moveRight(1)
				p.scanCharSet(caseInsensitive, true)
			}
		} else {
			if !scanOnly {
				cc.addRange(ch, ch)
			}
		}
	}

	if !closed {
		return nil, p.getErr(ErrUnterminatedBracket)
	}

	if !scanOnly && caseInsensitive {
		cc.addLowercase()
	}

	return cc, nil
}
// scanDecimal consumes the run of ASCII decimal digits at the current position
// and returns their value. With no digit at the current position it returns 0
// and consumes nothing. If appending a digit would push the value past the
// limit described by maxValueDiv10 and maxValueMod10, it returns the
// ErrCaptureGroupOutOfRange parse error (that digit is already consumed).
func (p *parser) scanDecimal() (int, error) {
	value := 0
	for p.charsRight() > 0 {
		digit := int(p.rightChar(0) - '0')
		if digit < 0 || digit > 9 {
			break
		}
		p.moveRight(1)

		// Check before multiplying so the accumulator itself never overflows.
		overflow := value > maxValueDiv10 || (value == maxValueDiv10 && digit > maxValueMod10)
		if overflow {
			return 0, p.getErr(ErrCaptureGroupOutOfRange)
		}
		value = value*10 + digit
	}
	return value, nil
}
// isOnlyTopOption reports whether option is one of the options that may only
// be set at the top level: RightToLeft, ECMAScript or RE2.
func isOnlyTopOption(option RegexOptions) bool {
	switch option {
	case RightToLeft, ECMAScript, RE2:
		return true
	}
	return false
}
// scanOptions scans an inline option string such as "imsx-imsx" and applies it
// to p.options. A '-' switches to turning options off and a '+' switches back
// to turning them on. Scanning stops, without consuming the character, at the
// first char that is not an option code or that names a top-level-only option.
func (p *parser) scanOptions() {
	disable := false
	for p.charsRight() > 0 {
		switch ch := p.rightChar(0); ch {
		case '-':
			disable = true
		case '+':
			disable = false
		default:
			opt := optionFromCode(ch)
			if opt == 0 || isOnlyTopOption(opt) {
				return
			}
			if disable {
				p.options &^= opt
			} else {
				p.options |= opt
			}
		}
		p.moveRight(1)
	}
}
// scanCharEscape scans the escape code that follows a backslash and returns
// the single char it stands for. Numeric forms (octal, \x, \u, \c) are handed
// to their dedicated scanners. Any other char is returned as itself, except
// that outside ECMAScript mode an unknown escape of a word char is reported
// as ErrUnrecognizedEscape.
func (p *parser) scanCharEscape() (rune, error) {
	esc := p.moveRightGetChar()

	// Escapes that need further scanning.
	switch {
	case esc >= '0' && esc <= '7':
		// step back so scanOctal sees the first digit as well
		p.moveLeft()
		return p.scanOctal(), nil
	case esc == 'x':
		// support for \x{HEX} syntax from Perl and PCRE
		if p.charsRight() > 0 && p.rightChar(0) == '{' {
			p.moveRight(1)
			return p.scanHexUntilBrace()
		}
		return p.scanHex(2)
	case esc == 'u':
		return p.scanHex(4)
	case esc == 'c':
		return p.scanControl()
	}

	// Escapes with a fixed value.
	switch esc {
	case 'a':
		return '\a', nil
	case 'b':
		return '\b', nil
	case 'e':
		return '\x1b', nil
	case 'f':
		return '\f', nil
	case 'n':
		return '\n', nil
	case 'r':
		return '\r', nil
	case 't':
		return '\t', nil
	case 'v':
		return '\v', nil
	}

	if !p.useOptionE() && IsWordChar(esc) {
		return 0, p.getErr(ErrUnrecognizedEscape, string(esc))
	}
	return esc, nil
}
// scanControl consumes the char after \c and converts it to the matching
// ASCII control character. Lowercase letters are treated as uppercase
// (\ca == \cA). It returns ErrMissingControl when the pattern ends right
// after \c, and ErrUnrecognizedControl when the char does not map into the
// control range 0x00-0x1F.
func (p *parser) scanControl() (rune, error) {
	if p.charsRight() < 1 {
		return 0, p.getErr(ErrMissingControl)
	}

	letter := p.moveRightGetChar()
	if 'a' <= letter && letter <= 'z' {
		letter -= 'a' - 'A'
	}

	code := letter - '@'
	if code < 0 || code >= ' ' {
		return 0, p.getErr(ErrUnrecognizedControl)
	}
	return code, nil
}
// scanHexUntilBrace scans the hex digits of a \x{...} escape, starting just
// after the opening brace, and consumes the closing brace.
//
// Errors: ErrTooFewHex for an empty "\x{}", ErrMissingBrace for a non-hex char
// or when the pattern ends before '}', and ErrInvalidHex once the value
// exceeds unicode.MaxRune (the PCRE spec does not limit the digit count, but
// Unicode bounds the value).
func (p *parser) scanHexUntilBrace() (rune, error) {
	value := 0
	for digits := 0; p.charsRight() > 0; digits++ {
		ch := p.moveRightGetChar()
		if ch == '}' {
			// closing brace reached; reject \x{}
			if digits == 0 {
				return 0, p.getErr(ErrTooFewHex)
			}
			return rune(value), nil
		}

		// anything other than the brace has to be a hex digit
		d := hexDigit(ch)
		if d < 0 {
			return 0, p.getErr(ErrMissingBrace)
		}

		value = value*0x10 + d
		if value > unicode.MaxRune {
			return 0, p.getErr(ErrInvalidHex)
		}
	}

	// ran out of pattern without ever seeing the closing brace
	return 0, p.getErr(ErrMissingBrace)
}
// scanHex scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) and
// returns their value. It returns ErrTooFewHex when fewer than c chars remain
// or when a non-hex char is met before c digits were read.
func (p *parser) scanHex(c int) (rune, error) {
	value := 0
	remaining := c
	if p.charsRight() >= remaining {
		for ; remaining > 0; remaining-- {
			d := hexDigit(p.moveRightGetChar())
			if d < 0 {
				break
			}
			value = value*0x10 + d
		}
	}

	if remaining > 0 {
		return 0, p.getErr(ErrTooFewHex)
	}
	return rune(value), nil
}
// hexDigit returns the value (0-15) of the ASCII hex digit ch, accepting both
// upper and lower case letters, or -1 when ch is not a hex digit.
func hexDigit(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch-'a') + 0xa
	case 'A' <= ch && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
// scanOctal scans an octal escape of at most three digits starting at the
// current position and returns its value. Scanning stops at the first char
// that is not an octal digit, or at the end of the pattern. In ECMAScript mode
// no further digit is consumed once the accumulated value has reached 0x20.
//
// Octal codes only go up to 255 (0377); for larger values the behavior Perl
// follows is to truncate the high bits, so the result is masked with 0xFF.
func (p *parser) scanOctal() rune {
	// Never look at more than three chars, nor past the end of the pattern.
	c := 3
	if c > p.charsRight() {
		c = p.charsRight()
	}

	i := 0
	for ; c > 0; c-- {
		d := int(p.rightChar(0) - '0')
		// Both bounds matter: chars below '0' (space, '-', ...) yield a
		// negative d and must end the escape rather than be folded in.
		if d < 0 || d > 7 {
			break
		}
		// Test the ECMAScript limit before touching the next digit so that
		// every digit added to the value is also consumed from the pattern.
		if p.useOptionE() && i >= 0x20 {
			break
		}
		i = i*8 + d
		p.moveRight(1)
	}

	return rune(i & 0xFF)
}
// textpos returns the current parsing position (p.currentPos), which the
// parser uses as an index into p.pattern.
func (p *parser) textpos() int {
return p.currentPos
}
// textto jumps to the parsing position pos by overwriting p.currentPos.
// No bounds check is done here.
func (p *parser) textto(pos int) {
p.currentPos = pos
}
// moveRightGetChar returns the char at the current parsing position and then
// advances the position by one. It does no bounds check, so callers must make
// sure at least one char remains.
func (p *parser) moveRightGetChar() rune {
	pos := p.currentPos
	next := p.pattern[pos]
	p.currentPos = pos + 1
	return next
}
// moveRight advances the current parsing position by i chars
// (callers normally pass 1).
func (p *parser) moveRight(i int) {
	p.currentPos = p.currentPos + i
}
// moveLeft steps the current parsing position back by one char.
func (p *parser) moveLeft() {
	p.currentPos -= 1
}
// charAt returns the pattern char at absolute index i; the index is not
// relative to the current parsing position and is not bounds-checked here.
func (p *parser) charAt(i int) rune {
return p.pattern[i]
}
// rightChar returns the char i positions to the right of the current parsing
// position without moving; rightChar(0) is the next char to be consumed.
func (p *parser) rightChar(i int) rune {
	idx := p.currentPos + i
	return p.pattern[idx]
}
// charsRight returns how many pattern chars remain at or after the current
// parsing position.
func (p *parser) charsRight() int {
	total := len(p.pattern)
	return total - p.currentPos
}
// rightMost reports whether the current parsing position is exactly at the
// end of the pattern.
func (p *parser) rightMost() bool {
	return len(p.pattern) == p.currentPos
}
// captureSlotFromName returns the slot number stored in p.capnames for
// capname. A name that is not in the map yields the map's zero value.
func (p *parser) captureSlotFromName(capname string) int {
	slot := p.capnames[capname]
	return slot
}
// isCaptureSlot reports whether capture slot i was noted. When p.caps is set
// the slot must be a key of that map; otherwise any slot in the range
// [0, p.capsize) counts.
func (p *parser) isCaptureSlot(i int) bool {
	if p.caps == nil {
		return 0 <= i && i < p.capsize
	}
	_, noted := p.caps[i]
	return noted
}
// isCaptureName reports whether capname is a key of p.capnames. It returns
// false when p.capnames is nil.
func (p *parser) isCaptureName(capname string) bool {
	found := false
	if p.capnames != nil {
		_, found = p.capnames[capname]
	}
	return found
}
// option shortcuts

// useOptionN reports whether the ExplicitCapture ("n") option, which disables
// '(' autocapture, is set in p.options.
func (p *parser) useOptionN() bool {
	enabled := p.options & ExplicitCapture
	return enabled != 0
}
// useOptionI reports whether the IgnoreCase ("i") option, which enables
// case-insensitivity, is set in p.options.
func (p *parser) useOptionI() bool {
	enabled := p.options & IgnoreCase
	return enabled != 0
}
// useOptionM reports whether the Multiline ("m") option, which alters the
// meaning of $ and ^, is set in p.options.
func (p *parser) useOptionM() bool {
	enabled := p.options & Multiline
	return enabled != 0
}
// useOptionS reports whether the Singleline ("s") option, which alters the
// meaning of '.', is set in p.options.
func (p *parser) useOptionS() bool {
	enabled := p.options & Singleline
	return enabled != 0
}
// useOptionX reports whether the IgnorePatternWhitespace ("x") option, which
// enables whitespace/comment mode, is set in p.options.
func (p *parser) useOptionX() bool {
	enabled := p.options & IgnorePatternWhitespace
	return enabled != 0
}
// useOptionE reports whether the ECMAScript ("e") option, which enables
// ECMAScript behavior, is set in p.options.
func (p *parser) useOptionE() bool {
	enabled := p.options & ECMAScript
	return enabled != 0
}
// useRE2 reports whether the RE2 option, which selects RE2 compatibility
// parsing behavior, is set in p.options.
func (p *parser) useRE2() bool {
	enabled := p.options & RE2
	return enabled != 0
}
// emptyOptionsStack reports whether no options are currently saved on
// p.optionsStack.
func (p *parser) emptyOptionsStack() bool {
	depth := len(p.optionsStack)
	return depth == 0
}
// addConcatenate finishes the current quantifiable unit when no quantifier
// was found (or none is possible): the unit is appended to the current
// concatenation as it is and p.unit is cleared.
func (p *parser) addConcatenate() {
	finished := p.unit
	p.concatenation.addChild(finished)
	p.unit = nil
}
// addConcatenate3 finishes the current quantifiable unit when a quantifier
// was found: the unit is wrapped through makeQuantifier(lazy, min, max), the
// result is appended to the current concatenation, and p.unit is cleared.
func (p *parser) addConcatenate3(lazy bool, min, max int) {
	quantified := p.unit.makeQuantifier(lazy, min, max)
	p.concatenation.addChild(quantified)
	p.unit = nil
}
// addUnitOne makes the current unit an ntOne node for ch. With the
// IgnoreCase option on, ch is lower-cased first.
func (p *parser) addUnitOne(ch rune) {
	folded := ch
	if p.useOptionI() {
		folded = unicode.ToLower(ch)
	}
	p.unit = newRegexNodeCh(ntOne, p.options, folded)
}
// addUnitNotone makes the current unit an ntNotone (inverse-char) node for
// ch. With the IgnoreCase option on, ch is lower-cased first.
func (p *parser) addUnitNotone(ch rune) {
	folded := ch
	if p.useOptionI() {
		folded = unicode.ToLower(ch)
	}
	p.unit = newRegexNodeCh(ntNotone, p.options, folded)
}
// addUnitSet makes the current unit an ntSet node built from set and the
// parser's current options.
func (p *parser) addUnitSet(set *CharSet) {
	node := newRegexNodeSet(ntSet, p.options, set)
	p.unit = node
}
// addUnitNode makes node, an already-built subtree, the current unit. The
// node is stored as given; nothing is copied.
func (p *parser) addUnitNode(node *regexNode) {
p.unit = node
}
// addUnitType makes the current unit a new node of type t (used for
// assertions), created with the parser's current options.
func (p *parser) addUnitType(t nodeType) {
	node := newRegexNode(t, p.options)
	p.unit = node
}
// addGroup finishes the current group (in response to a ')' or the end of the
// pattern) and makes the group the current unit.
//
// For ntTestgroup and ntTestref groups the pending concatenation becomes a
// direct child of the group; ErrTooManyAlternates is returned when that leaves
// an ntTestref group with more than 2 children or an ntTestgroup group with
// more than 3. For every other group type the concatenation is added to the
// alternation, which in turn becomes a child of the group.
func (p *parser) addGroup() error {
	switch p.group.t {
	case ntTestgroup, ntTestref:
		p.group.addChild(p.concatenation.reverseLeft())

		limit := 3
		if p.group.t == ntTestref {
			limit = 2
		}
		if len(p.group.children) > limit {
			return p.getErr(ErrTooManyAlternates)
		}
	default:
		p.alternation.addChild(p.concatenation.reverseLeft())
		p.group.addChild(p.alternation)
	}

	p.unit = p.group
	return nil
}
// popKeepOptions drops the top entry of the options stack while leaving
// p.options as it is. The stack must not be empty.
func (p *parser) popKeepOptions() {
	p.optionsStack = p.optionsStack[:len(p.optionsStack)-1]
}
// popOptions restores p.options from the top of the options stack and removes
// that entry by reslicing. The stack must not be empty.
func (p *parser) popOptions() {
	top := len(p.optionsStack) - 1
	p.options, p.optionsStack = p.optionsStack[top], p.optionsStack[:top]
}
// pushOptions saves the current options on top of the options stack.
func (p *parser) pushOptions() {
	current := p.options
	p.optionsStack = append(p.optionsStack, current)
}
// addToConcatenate appends the cch literal pattern chars starting at pos to
// the current concatenation. A single char becomes an ntOne node, a longer run
// becomes an ntMulti node, and cch == 0 adds nothing. With the IgnoreCase
// option on, and unless isReplacement is set, the chars are lower-cased first.
func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
	if cch == 0 {
		return
	}

	foldCase := p.useOptionI() && !isReplacement

	var node *regexNode
	if cch > 1 {
		// Give the node its own copy of the run. Slicing p.pattern directly
		// would make the lower-casing below rewrite the parser's pattern
		// buffer in place, and would leave the node's string sharing a backing
		// array with that buffer.
		str := make([]rune, cch)
		copy(str, p.pattern[pos:pos+cch])

		if foldCase {
			// We do the ToLower character by character for consistency. With surrogate chars, doing
			// a ToLower on the entire string could actually change the surrogate pair. This is more correct
			// linguistically, but since Regex doesn't support surrogates, it's more important to be
			// consistent.
			for i, r := range str {
				str[i] = unicode.ToLower(r)
			}
		}
		node = newRegexNodeStr(ntMulti, p.options, str)
	} else {
		ch := p.charAt(pos)
		if foldCase {
			ch = unicode.ToLower(ch)
		}
		node = newRegexNodeCh(ntOne, p.options, ch)
	}

	p.concatenation.addChild(node)
}
// pushGroup saves the parser state (in response to an open paren). The
// current concatenation, alternation and group are chained through their next
// fields on top of the existing stack, with the concatenation as the new top.
func (p *parser) pushGroup() {
	group, alt, concat := p.group, p.alternation, p.concatenation

	group.next = p.stack
	alt.next = group
	concat.next = alt
	p.stack = concat
}
// popGroup restores the parser state saved by pushGroup (in response to a
// ')'), unlinking concatenation, alternation and group from the stack.
//
// If the restored group is an ntTestgroup that has no children yet, the
// current unit goes directly into the group as its first child;
// ErrConditionalExpression is returned when there is no current unit.
func (p *parser) popGroup() error {
	concat := p.stack
	p.concatenation = concat
	p.alternation = concat.next
	p.group = p.alternation.next
	p.stack = p.group.next

	if p.group.t != ntTestgroup || len(p.group.children) != 0 {
		return nil
	}

	// The first () inside a Testgroup group goes directly to the group
	if p.unit == nil {
		return p.getErr(ErrConditionalExpression)
	}
	p.group.addChild(p.unit)
	p.unit = nil
	return nil
}
// emptyStack reports whether no group state is currently pushed, i.e.
// p.stack is nil.
func (p *parser) emptyStack() bool {
	if p.stack != nil {
		return false
	}
	return true
}
// startGroup begins a new round of parser state (in response to an open paren
// or the start of the pattern): openGroup becomes the current group, and a
// fresh alternation and concatenation are created with the current options.
func (p *parser) startGroup(openGroup *regexNode) {
	opts := p.options

	p.group = openGroup
	p.alternation = newRegexNode(ntAlternate, opts)
	p.concatenation = newRegexNode(ntConcatenate, opts)
}
// addAlternate finishes the current concatenation (in response to a '|') and
// starts a new, empty one. The finished concatenation is added to the current
// alternation, except inside ntTestgroup and ntTestref groups, where the
// '|' parts go directly to the group.
func (p *parser) addAlternate() {
	target := p.alternation
	if t := p.group.t; t == ntTestgroup || t == ntTestref {
		target = p.group
	}
	target.addChild(p.concatenation.reverseLeft())

	p.concatenation = newRegexNode(ntConcatenate, p.options)
}
// Categories used to classify ASCII characters in _category. The numeric
// order matters: the lookup helpers compare categories with >=.
const (
	Q byte = 5 // quantifier
	S      = 4 // ordinary stopper
	Z      = 3 // ScanBlank stopper
	X      = 2 // whitespace
	E      = 1 // should be escaped
)

// _category maps each ASCII code point (0x00-0x7F) to its category. Only the
// characters with a non-zero category are listed; every other entry is 0.
var _category = []byte{
	// whitespace
	'\t': X, '\n': X, '\v': X, '\f': X, '\r': X, ' ': X,

	// ScanBlank stopper
	'#': Z,

	// ordinary stoppers
	'$': S, '(': S, ')': S, '.': S, '[': S, '\\': S, '^': S, '|': S,

	// quantifiers
	'*': Q, '+': Q, '?': Q, '{': Q,

	// pins the table length at 128 entries
	0x7F: 0,
}
// isSpace reports whether ch is at or below ' ' and is categorized as
// whitespace (X) in _category.
func isSpace(ch rune) bool {
	if ch > ' ' {
		return false
	}
	return _category[ch] == X
}
// isSpecial reports whether ch terminates a string of ordinary chars: it is
// at or below '|' and its category is S (ordinary stopper) or higher.
func isSpecial(ch rune) bool {
	if ch > '|' {
		return false
	}
	return _category[ch] >= S
}
// isStopperX is a wider form of isSpecial: it reports whether ch is at or
// below '|' and its category is X (whitespace) or higher, so whitespace and
// the ScanBlank stopper count as stoppers as well.
func isStopperX(ch rune) bool {
	if ch > '|' {
		return false
	}
	return _category[ch] >= X
}
// isQuantifier reports whether ch begins a quantifier: it is at or below '{'
// and its category is Q.
func isQuantifier(ch rune) bool {
	if ch > '{' {
		return false
	}
	return _category[ch] >= Q
}
// isTrueQuantifier reports whether a real quantifier starts at the current
// parsing position, without consuming anything. Any quantifier char other
// than '{' qualifies directly. A '{' qualifies only when it opens a
// well-formed "{n}", "{n,}" or "{n,m}" with decimal n and m.
func (p *parser) isTrueQuantifier() bool {
	remaining := p.charsRight()
	if remaining == 0 {
		return false
	}

	start := p.textpos()
	cur := p.charAt(start)
	if cur != '{' {
		return cur <= '{' && _category[cur] >= Q
	}

	idx := start
	// skipDigits steps over a run of decimal digits after idx. It leaves cur
	// at the first non-digit char, or drops remaining to zero when the
	// pattern runs out first.
	skipDigits := func() {
		for {
			remaining--
			if remaining <= 0 {
				return
			}
			idx++
			cur = p.charAt(idx)
			if cur < '0' || cur > '9' {
				return
			}
		}
	}

	// the mandatory minimum count
	skipDigits()
	if remaining == 0 || idx-start == 1 {
		// pattern ended, or no digit at all after the brace
		return false
	}

	switch cur {
	case '}':
		return true
	case ',':
		// an optional maximum count follows
	default:
		return false
	}

	skipDigits()
	return remaining > 0 && cur == '}'
}