189 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			189 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| //  Copyright (c) 2017 Couchbase, Inc.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| // 		http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package regexp
 | |
| 
 | |
| import (
 | |
| 	"encoding/binary"
 | |
| 	"fmt"
 | |
| )
 | |
| 
 | |
| // StateLimit is the maximum number of states allowed
 | |
| const StateLimit = 10000
 | |
| 
 | |
| // ErrTooManyStates is returned if you attempt to build a Levenshtein
 | |
| // automaton which requries too many states.
 | |
| var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
 | |
| 	StateLimit)
 | |
| 
 | |
| type dfaBuilder struct {
 | |
| 	dfa    *dfa
 | |
| 	cache  map[string]int
 | |
| 	keyBuf []byte
 | |
| }
 | |
| 
 | |
| func newDfaBuilder(insts prog) *dfaBuilder {
 | |
| 	d := &dfaBuilder{
 | |
| 		dfa: &dfa{
 | |
| 			insts:  insts,
 | |
| 			states: make([]*state, 0, 16),
 | |
| 		},
 | |
| 		cache: make(map[string]int, 1024),
 | |
| 	}
 | |
| 	// add 0 state that is invalid
 | |
| 	d.dfa.states = append(d.dfa.states, &state{
 | |
| 		next:  make([]int, 256),
 | |
| 		match: false,
 | |
| 	})
 | |
| 	return d
 | |
| }
 | |
| 
 | |
| func (d *dfaBuilder) build() (*dfa, error) {
 | |
| 	cur := newSparseSet(uint(len(d.dfa.insts)))
 | |
| 	next := newSparseSet(uint(len(d.dfa.insts)))
 | |
| 
 | |
| 	d.dfa.add(cur, 0)
 | |
| 	states := intStack{d.cachedState(cur)}
 | |
| 	seen := make(map[int]struct{})
 | |
| 	var s int
 | |
| 	states, s = states.Pop()
 | |
| 	for s != 0 {
 | |
| 		for b := 0; b < 256; b++ {
 | |
| 			ns := d.runState(cur, next, s, byte(b))
 | |
| 			if ns != 0 {
 | |
| 				if _, ok := seen[ns]; !ok {
 | |
| 					seen[ns] = struct{}{}
 | |
| 					states = states.Push(ns)
 | |
| 				}
 | |
| 			}
 | |
| 			if len(d.dfa.states) > StateLimit {
 | |
| 				return nil, ErrTooManyStates
 | |
| 			}
 | |
| 		}
 | |
| 		states, s = states.Pop()
 | |
| 	}
 | |
| 	return d.dfa, nil
 | |
| }
 | |
| 
 | |
| func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int {
 | |
| 	cur.Clear()
 | |
| 	for _, ip := range d.dfa.states[state].insts {
 | |
| 		cur.Add(ip)
 | |
| 	}
 | |
| 	d.dfa.run(cur, next, b)
 | |
| 	nextState := d.cachedState(next)
 | |
| 	d.dfa.states[state].next[b] = nextState
 | |
| 	return nextState
 | |
| }
 | |
| 
 | |
| func instsKey(insts []uint, buf []byte) []byte {
 | |
| 	if cap(buf) < 8*len(insts) {
 | |
| 		buf = make([]byte, 8*len(insts))
 | |
| 	} else {
 | |
| 		buf = buf[0 : 8*len(insts)]
 | |
| 	}
 | |
| 	for i, inst := range insts {
 | |
| 		binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
 | |
| 	}
 | |
| 	return buf
 | |
| }
 | |
| 
 | |
| func (d *dfaBuilder) cachedState(set *sparseSet) int {
 | |
| 	var insts []uint
 | |
| 	var isMatch bool
 | |
| 	for i := uint(0); i < uint(set.Len()); i++ {
 | |
| 		ip := set.Get(i)
 | |
| 		switch d.dfa.insts[ip].op {
 | |
| 		case OpRange:
 | |
| 			insts = append(insts, ip)
 | |
| 		case OpMatch:
 | |
| 			isMatch = true
 | |
| 			insts = append(insts, ip)
 | |
| 		}
 | |
| 	}
 | |
| 	if len(insts) == 0 {
 | |
| 		return 0
 | |
| 	}
 | |
| 	d.keyBuf = instsKey(insts, d.keyBuf)
 | |
| 	v, ok := d.cache[string(d.keyBuf)]
 | |
| 	if ok {
 | |
| 		return v
 | |
| 	}
 | |
| 	d.dfa.states = append(d.dfa.states, &state{
 | |
| 		insts: insts,
 | |
| 		next:  make([]int, 256),
 | |
| 		match: isMatch,
 | |
| 	})
 | |
| 	newV := len(d.dfa.states) - 1
 | |
| 	d.cache[string(d.keyBuf)] = newV
 | |
| 	return newV
 | |
| }
 | |
| 
 | |
| type dfa struct {
 | |
| 	insts  prog
 | |
| 	states []*state
 | |
| }
 | |
| 
 | |
| func (d *dfa) add(set *sparseSet, ip uint) {
 | |
| 	if set.Contains(ip) {
 | |
| 		return
 | |
| 	}
 | |
| 	set.Add(ip)
 | |
| 	switch d.insts[ip].op {
 | |
| 	case OpJmp:
 | |
| 		d.add(set, d.insts[ip].to)
 | |
| 	case OpSplit:
 | |
| 		d.add(set, d.insts[ip].splitA)
 | |
| 		d.add(set, d.insts[ip].splitB)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (d *dfa) run(from, to *sparseSet, b byte) bool {
 | |
| 	to.Clear()
 | |
| 	var isMatch bool
 | |
| 	for i := uint(0); i < uint(from.Len()); i++ {
 | |
| 		ip := from.Get(i)
 | |
| 		switch d.insts[ip].op {
 | |
| 		case OpMatch:
 | |
| 			isMatch = true
 | |
| 		case OpRange:
 | |
| 			if d.insts[ip].rangeStart <= b &&
 | |
| 				b <= d.insts[ip].rangeEnd {
 | |
| 				d.add(to, ip+1)
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return isMatch
 | |
| }
 | |
| 
 | |
| type state struct {
 | |
| 	insts []uint
 | |
| 	next  []int
 | |
| 	match bool
 | |
| }
 | |
| 
 | |
| type intStack []int
 | |
| 
 | |
| func (s intStack) Push(v int) intStack {
 | |
| 	return append(s, v)
 | |
| }
 | |
| 
 | |
| func (s intStack) Pop() (intStack, int) {
 | |
| 	l := len(s)
 | |
| 	if l < 1 {
 | |
| 		return s, 0
 | |
| 	}
 | |
| 	return s[:l-1], s[l-1]
 | |
| }
 |