forked from Shiloh/githaven
251 lines
6.4 KiB
Go
251 lines
6.4 KiB
Go
|
//  Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
||
|
|
||
|
package levenshtein2
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"math"
|
||
|
)
|
||
|
|
||
|
// SinkState is state ID 0, typed as uint32 like the entries of the
// transition tables. DFA.CanMatch reports false for this state.
const SinkState uint32 = 0
|
||
|
|
||
|
type DFA struct {
|
||
|
transitions [][256]uint32
|
||
|
distances []Distance
|
||
|
initState int
|
||
|
ed uint8
|
||
|
}
|
||
|
|
||
|
/// Returns the initial state
|
||
|
func (d *DFA) initialState() int {
|
||
|
return d.initState
|
||
|
}
|
||
|
|
||
|
/// Returns the Levenshtein distance associated to the
|
||
|
/// current state.
|
||
|
func (d *DFA) distance(stateId int) Distance {
|
||
|
return d.distances[stateId]
|
||
|
}
|
||
|
|
||
|
/// Returns the number of states in the `DFA`.
|
||
|
func (d *DFA) numStates() int {
|
||
|
return len(d.transitions)
|
||
|
}
|
||
|
|
||
|
/// Returns the destination state reached after consuming a given byte.
|
||
|
func (d *DFA) transition(fromState int, b uint8) int {
|
||
|
return int(d.transitions[fromState][b])
|
||
|
}
|
||
|
|
||
|
func (d *DFA) eval(bytes []uint8) Distance {
|
||
|
state := d.initialState()
|
||
|
|
||
|
for _, b := range bytes {
|
||
|
state = d.transition(state, b)
|
||
|
}
|
||
|
|
||
|
return d.distance(state)
|
||
|
}
|
||
|
|
||
|
func (d *DFA) Start() int {
|
||
|
return int(d.initialState())
|
||
|
}
|
||
|
|
||
|
func (d *DFA) IsMatch(state int) bool {
|
||
|
if _, ok := d.distance(state).(Exact); ok {
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (d *DFA) CanMatch(state int) bool {
|
||
|
return state > 0 && state < d.numStates()
|
||
|
}
|
||
|
|
||
|
func (d *DFA) Accept(state int, b byte) int {
|
||
|
return int(d.transition(state, b))
|
||
|
}
|
||
|
|
||
|
// WillAlwaysMatch returns if the specified state will always end in a
|
||
|
// matching state.
|
||
|
func (d *DFA) WillAlwaysMatch(state int) bool {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
// fill overwrites every element of dest with val. A nil or empty slice is
// left untouched.
func fill(dest []uint32, val uint32) {
	n := len(dest)
	for i := 0; i < n; i++ {
		dest[i] = val
	}
}
|
||
|
|
||
|
// fillTransitions points all 256 byte transitions of the row dest at the
// state val.
func fillTransitions(dest *[256]uint32, val uint32) {
	for b := 0; b < len(dest); b++ {
		dest[b] = val
	}
}
|
||
|
|
||
|
type Utf8DFAStateBuilder struct {
|
||
|
dfaBuilder *Utf8DFABuilder
|
||
|
stateID uint32
|
||
|
defaultSuccessor []uint32
|
||
|
}
|
||
|
|
||
|
func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8,
|
||
|
toStateID uint32) {
|
||
|
sb.dfaBuilder.transitions[fromStateID][b] = toStateID
|
||
|
}
|
||
|
|
||
|
func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
|
||
|
fromStateID := sb.stateID
|
||
|
chars := []byte(string(in))
|
||
|
lastByte := chars[len(chars)-1]
|
||
|
|
||
|
for i, ch := range chars[:len(chars)-1] {
|
||
|
remNumBytes := len(chars) - i - 1
|
||
|
defaultSuccessor := sb.defaultSuccessor[remNumBytes]
|
||
|
intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch]
|
||
|
|
||
|
if intermediateStateID == defaultSuccessor {
|
||
|
intermediateStateID = sb.dfaBuilder.allocate()
|
||
|
fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID],
|
||
|
sb.defaultSuccessor[remNumBytes-1])
|
||
|
}
|
||
|
|
||
|
sb.addTransitionID(fromStateID, ch, intermediateStateID)
|
||
|
fromStateID = intermediateStateID
|
||
|
}
|
||
|
|
||
|
toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID))
|
||
|
sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded)
|
||
|
}
|
||
|
|
||
|
// Utf8StateId identifies a state of the byte-level automaton. Each
// character-level state ID s owns the four consecutive values 4*s .. 4*s+3.
type Utf8StateId uint32

// original returns the Utf8StateId of the character-level state stateId
// itself, i.e. slot 0 of its group of four (4*stateId).
func original(stateId uint32) Utf8StateId {
	return Utf8StateId(stateId << 2)
}

// predecessor returns the Utf8StateId at offset numSteps inside the group of
// four belonging to stateId (4*stateId + numSteps). The arithmetic is plain
// uint32 arithmetic: numSteps is not range-checked and overflow wraps.
func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
	base := stateId << 2
	return Utf8StateId(base + uint32(numSteps))
}
|
||
|
|
||
|
// Utf8DFABuilder makes it possible to define a DFA
|
||
|
// that takes unicode character, and build a `DFA`
|
||
|
// that operates on utf-8 encoded
|
||
|
type Utf8DFABuilder struct {
|
||
|
index []uint32
|
||
|
distances []Distance
|
||
|
transitions [][256]uint32
|
||
|
initialState uint32
|
||
|
numStates uint32
|
||
|
maxNumStates uint32
|
||
|
}
|
||
|
|
||
|
func withMaxStates(maxStates uint32) *Utf8DFABuilder {
|
||
|
rv := &Utf8DFABuilder{
|
||
|
index: make([]uint32, maxStates*2+100),
|
||
|
distances: make([]Distance, 0, maxStates),
|
||
|
transitions: make([][256]uint32, 0, maxStates),
|
||
|
maxNumStates: maxStates,
|
||
|
}
|
||
|
|
||
|
for i := range rv.index {
|
||
|
rv.index[i] = math.MaxUint32
|
||
|
}
|
||
|
|
||
|
return rv
|
||
|
}
|
||
|
|
||
|
func (dfab *Utf8DFABuilder) allocate() uint32 {
|
||
|
newState := dfab.numStates
|
||
|
dfab.numStates++
|
||
|
|
||
|
dfab.distances = append(dfab.distances, Atleast{d: 255})
|
||
|
dfab.transitions = append(dfab.transitions, [256]uint32{})
|
||
|
|
||
|
return newState
|
||
|
}
|
||
|
|
||
|
func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
|
||
|
if int(state) >= cap(dfab.index) {
|
||
|
cloneIndex := make([]uint32, int(state)*2)
|
||
|
copy(cloneIndex, dfab.index)
|
||
|
dfab.index = cloneIndex
|
||
|
}
|
||
|
if dfab.index[state] != math.MaxUint32 {
|
||
|
return dfab.index[state]
|
||
|
}
|
||
|
|
||
|
nstate := dfab.allocate()
|
||
|
dfab.index[state] = nstate
|
||
|
|
||
|
return nstate
|
||
|
}
|
||
|
|
||
|
func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
|
||
|
decodedID := dfab.getOrAllocate(original(iState))
|
||
|
dfab.initialState = decodedID
|
||
|
}
|
||
|
|
||
|
func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
|
||
|
return &DFA{
|
||
|
transitions: dfab.transitions,
|
||
|
distances: dfab.distances,
|
||
|
initState: int(dfab.initialState),
|
||
|
ed: ed,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
|
||
|
distance Distance) (*Utf8DFAStateBuilder, error) {
|
||
|
if state > dfab.maxNumStates {
|
||
|
return nil, fmt.Errorf("State id is larger than maxNumStates")
|
||
|
}
|
||
|
|
||
|
stateID := dfab.getOrAllocate(original(state))
|
||
|
dfab.distances[stateID] = distance
|
||
|
|
||
|
defaultSuccID := dfab.getOrAllocate(original(default_suc_orig))
|
||
|
// creates a chain of states of predecessors of `default_suc_orig`.
|
||
|
// Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]`
|
||
|
// leads to the `default_suc_orig` state.
|
||
|
predecessorStates := []uint32{defaultSuccID,
|
||
|
defaultSuccID,
|
||
|
defaultSuccID,
|
||
|
defaultSuccID}
|
||
|
|
||
|
for numBytes := uint8(1); numBytes < 4; numBytes++ {
|
||
|
predecessorState := predecessor(default_suc_orig, numBytes)
|
||
|
predecessorStateID := dfab.getOrAllocate(predecessorState)
|
||
|
predecessorStates[numBytes] = predecessorStateID
|
||
|
succ := predecessorStates[numBytes-1]
|
||
|
fillTransitions(&dfab.transitions[predecessorStateID], succ)
|
||
|
}
|
||
|
|
||
|
// 1-byte encoded chars.
|
||
|
fill(dfab.transitions[stateID][0:192], predecessorStates[0])
|
||
|
// 2-bytes encoded chars.
|
||
|
fill(dfab.transitions[stateID][192:224], predecessorStates[1])
|
||
|
// 3-bytes encoded chars.
|
||
|
fill(dfab.transitions[stateID][224:240], predecessorStates[2])
|
||
|
// 4-bytes encoded chars.
|
||
|
fill(dfab.transitions[stateID][240:256], predecessorStates[3])
|
||
|
|
||
|
return &Utf8DFAStateBuilder{
|
||
|
dfaBuilder: dfab,
|
||
|
stateID: stateID,
|
||
|
defaultSuccessor: predecessorStates}, nil
|
||
|
}
|