344 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			344 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| //  Copyright (c) 2017 Couchbase, Inc.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| // 		http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package regexp
 | |
| 
 | |
| import (
 | |
| 	"regexp/syntax"
 | |
| 	"unicode"
 | |
| 
 | |
| 	unicode_utf8 "unicode/utf8"
 | |
| 
 | |
| 	"github.com/couchbase/vellum/utf8"
 | |
| )
 | |
| 
 | |
| type compiler struct {
 | |
| 	sizeLimit uint
 | |
| 	insts     prog
 | |
| 	instsPool []inst
 | |
| 
 | |
| 	sequences  utf8.Sequences
 | |
| 	rangeStack utf8.RangeStack
 | |
| 	startBytes []byte
 | |
| 	endBytes   []byte
 | |
| }
 | |
| 
 | |
| func newCompiler(sizeLimit uint) *compiler {
 | |
| 	return &compiler{
 | |
| 		sizeLimit:  sizeLimit,
 | |
| 		startBytes: make([]byte, unicode_utf8.UTFMax),
 | |
| 		endBytes:   make([]byte, unicode_utf8.UTFMax),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
 | |
| 	err := c.c(ast)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	inst := c.allocInst()
 | |
| 	inst.op = OpMatch
 | |
| 	c.insts = append(c.insts, inst)
 | |
| 	return c.insts, nil
 | |
| }
 | |
| 
 | |
| func (c *compiler) c(ast *syntax.Regexp) (err error) {
 | |
| 	if ast.Flags&syntax.NonGreedy > 1 {
 | |
| 		return ErrNoLazy
 | |
| 	}
 | |
| 
 | |
| 	switch ast.Op {
 | |
| 	case syntax.OpEndLine, syntax.OpBeginLine,
 | |
| 		syntax.OpBeginText, syntax.OpEndText:
 | |
| 		return ErrNoEmpty
 | |
| 	case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
 | |
| 		return ErrNoWordBoundary
 | |
| 	case syntax.OpEmptyMatch:
 | |
| 		return nil
 | |
| 	case syntax.OpLiteral:
 | |
| 		for _, r := range ast.Rune {
 | |
| 			if ast.Flags&syntax.FoldCase > 0 {
 | |
| 				next := syntax.Regexp{
 | |
| 					Op:    syntax.OpCharClass,
 | |
| 					Flags: ast.Flags & syntax.FoldCase,
 | |
| 					Rune0: [2]rune{r, r},
 | |
| 				}
 | |
| 				next.Rune = next.Rune0[0:2]
 | |
| 				// try to find more folded runes
 | |
| 				for r1 := unicode.SimpleFold(r); r1 != r; r1 = unicode.SimpleFold(r1) {
 | |
| 					next.Rune = append(next.Rune, r1, r1)
 | |
| 				}
 | |
| 				err = c.c(&next)
 | |
| 				if err != nil {
 | |
| 					return err
 | |
| 				}
 | |
| 			} else {
 | |
| 				c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
 | |
| 					r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
 | |
| 				if err != nil {
 | |
| 					return err
 | |
| 				}
 | |
| 				for _, seq := range c.sequences {
 | |
| 					c.compileUtf8Ranges(seq)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	case syntax.OpAnyChar:
 | |
| 		next := syntax.Regexp{
 | |
| 			Op:    syntax.OpCharClass,
 | |
| 			Flags: ast.Flags & syntax.FoldCase,
 | |
| 			Rune0: [2]rune{0, unicode.MaxRune},
 | |
| 		}
 | |
| 		next.Rune = next.Rune0[:2]
 | |
| 		return c.c(&next)
 | |
| 	case syntax.OpAnyCharNotNL:
 | |
| 		next := syntax.Regexp{
 | |
| 			Op:    syntax.OpCharClass,
 | |
| 			Flags: ast.Flags & syntax.FoldCase,
 | |
| 			Rune:  []rune{0, 0x09, 0x0B, unicode.MaxRune},
 | |
| 		}
 | |
| 		return c.c(&next)
 | |
| 	case syntax.OpCharClass:
 | |
| 		return c.compileClass(ast)
 | |
| 	case syntax.OpCapture:
 | |
| 		return c.c(ast.Sub[0])
 | |
| 	case syntax.OpConcat:
 | |
| 		for _, sub := range ast.Sub {
 | |
| 			err := c.c(sub)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 		return nil
 | |
| 	case syntax.OpAlternate:
 | |
| 		if len(ast.Sub) == 0 {
 | |
| 			return nil
 | |
| 		}
 | |
| 		jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
 | |
| 		// does not handle last entry
 | |
| 		for i := 0; i < len(ast.Sub)-1; i++ {
 | |
| 			sub := ast.Sub[i]
 | |
| 			split := c.emptySplit()
 | |
| 			j1 := c.top()
 | |
| 			err := c.c(sub)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			jmpsToEnd = append(jmpsToEnd, c.emptyJump())
 | |
| 			j2 := c.top()
 | |
| 			c.setSplit(split, j1, j2)
 | |
| 		}
 | |
| 		// handle last entry
 | |
| 		err := c.c(ast.Sub[len(ast.Sub)-1])
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		end := uint(len(c.insts))
 | |
| 		for _, jmpToEnd := range jmpsToEnd {
 | |
| 			c.setJump(jmpToEnd, end)
 | |
| 		}
 | |
| 	case syntax.OpQuest:
 | |
| 		split := c.emptySplit()
 | |
| 		j1 := c.top()
 | |
| 		err := c.c(ast.Sub[0])
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		j2 := c.top()
 | |
| 		c.setSplit(split, j1, j2)
 | |
| 
 | |
| 	case syntax.OpStar:
 | |
| 		j1 := c.top()
 | |
| 		split := c.emptySplit()
 | |
| 		j2 := c.top()
 | |
| 		err := c.c(ast.Sub[0])
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		jmp := c.emptyJump()
 | |
| 		j3 := uint(len(c.insts))
 | |
| 
 | |
| 		c.setJump(jmp, j1)
 | |
| 		c.setSplit(split, j2, j3)
 | |
| 
 | |
| 	case syntax.OpPlus:
 | |
| 		j1 := c.top()
 | |
| 		err := c.c(ast.Sub[0])
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		split := c.emptySplit()
 | |
| 		j2 := c.top()
 | |
| 		c.setSplit(split, j1, j2)
 | |
| 
 | |
| 	case syntax.OpRepeat:
 | |
| 		if ast.Max == -1 {
 | |
| 			for i := 0; i < ast.Min; i++ {
 | |
| 				err := c.c(ast.Sub[0])
 | |
| 				if err != nil {
 | |
| 					return err
 | |
| 				}
 | |
| 			}
 | |
| 			next := syntax.Regexp{
 | |
| 				Op:    syntax.OpStar,
 | |
| 				Flags: ast.Flags,
 | |
| 				Sub:   ast.Sub,
 | |
| 				Sub0:  ast.Sub0,
 | |
| 				Rune:  ast.Rune,
 | |
| 				Rune0: ast.Rune0,
 | |
| 			}
 | |
| 			return c.c(&next)
 | |
| 		}
 | |
| 		for i := 0; i < ast.Min; i++ {
 | |
| 			err := c.c(ast.Sub[0])
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 		splits := make([]uint, 0, ast.Max-ast.Min)
 | |
| 		starts := make([]uint, 0, ast.Max-ast.Min)
 | |
| 		for i := ast.Min; i < ast.Max; i++ {
 | |
| 			splits = append(splits, c.emptySplit())
 | |
| 			starts = append(starts, uint(len(c.insts)))
 | |
| 			err := c.c(ast.Sub[0])
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 		end := uint(len(c.insts))
 | |
| 		for i := 0; i < len(splits); i++ {
 | |
| 			c.setSplit(splits[i], starts[i], end)
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	return c.checkSize()
 | |
| }
 | |
| 
 | |
| func (c *compiler) checkSize() error {
 | |
| 	if uint(len(c.insts)*instSize) > c.sizeLimit {
 | |
| 		return ErrCompiledTooBig
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (c *compiler) compileClass(ast *syntax.Regexp) error {
 | |
| 	if len(ast.Rune) == 0 {
 | |
| 		return nil
 | |
| 	}
 | |
| 	jmps := make([]uint, 0, len(ast.Rune)-2)
 | |
| 	// does not do last pair
 | |
| 	for i := 0; i < len(ast.Rune)-2; i += 2 {
 | |
| 		rstart := ast.Rune[i]
 | |
| 		rend := ast.Rune[i+1]
 | |
| 
 | |
| 		split := c.emptySplit()
 | |
| 		j1 := c.top()
 | |
| 		err := c.compileClassRange(rstart, rend)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		jmps = append(jmps, c.emptyJump())
 | |
| 		j2 := c.top()
 | |
| 		c.setSplit(split, j1, j2)
 | |
| 	}
 | |
| 	// handle last pair
 | |
| 	rstart := ast.Rune[len(ast.Rune)-2]
 | |
| 	rend := ast.Rune[len(ast.Rune)-1]
 | |
| 	err := c.compileClassRange(rstart, rend)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	end := c.top()
 | |
| 	for _, jmp := range jmps {
 | |
| 		c.setJump(jmp, end)
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (c *compiler) compileClassRange(startR, endR rune) (err error) {
 | |
| 	c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
 | |
| 		startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	jmps := make([]uint, 0, len(c.sequences)-1)
 | |
| 	// does not do last entry
 | |
| 	for i := 0; i < len(c.sequences)-1; i++ {
 | |
| 		seq := c.sequences[i]
 | |
| 		split := c.emptySplit()
 | |
| 		j1 := c.top()
 | |
| 		c.compileUtf8Ranges(seq)
 | |
| 		jmps = append(jmps, c.emptyJump())
 | |
| 		j2 := c.top()
 | |
| 		c.setSplit(split, j1, j2)
 | |
| 	}
 | |
| 	// handle last entry
 | |
| 	c.compileUtf8Ranges(c.sequences[len(c.sequences)-1])
 | |
| 	end := c.top()
 | |
| 	for _, jmp := range jmps {
 | |
| 		c.setJump(jmp, end)
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
 | |
| 	for _, r := range seq {
 | |
| 		inst := c.allocInst()
 | |
| 		inst.op = OpRange
 | |
| 		inst.rangeStart = r.Start
 | |
| 		inst.rangeEnd = r.End
 | |
| 		c.insts = append(c.insts, inst)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (c *compiler) emptySplit() uint {
 | |
| 	inst := c.allocInst()
 | |
| 	inst.op = OpSplit
 | |
| 	c.insts = append(c.insts, inst)
 | |
| 	return c.top() - 1
 | |
| }
 | |
| 
 | |
| func (c *compiler) emptyJump() uint {
 | |
| 	inst := c.allocInst()
 | |
| 	inst.op = OpJmp
 | |
| 	c.insts = append(c.insts, inst)
 | |
| 	return c.top() - 1
 | |
| }
 | |
| 
 | |
| func (c *compiler) setSplit(i, pc1, pc2 uint) {
 | |
| 	split := c.insts[i]
 | |
| 	split.splitA = pc1
 | |
| 	split.splitB = pc2
 | |
| }
 | |
| 
 | |
| func (c *compiler) setJump(i, pc uint) {
 | |
| 	jmp := c.insts[i]
 | |
| 	jmp.to = pc
 | |
| }
 | |
| 
 | |
| func (c *compiler) top() uint {
 | |
| 	return uint(len(c.insts))
 | |
| }
 | |
| 
 | |
| func (c *compiler) allocInst() *inst {
 | |
| 	if len(c.instsPool) <= 0 {
 | |
| 		c.instsPool = make([]inst, 16)
 | |
| 	}
 | |
| 	inst := &c.instsPool[0]
 | |
| 	c.instsPool = c.instsPool[1:]
 | |
| 	return inst
 | |
| }
 |