* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revision
		
			
				
	
	
		
			269 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			269 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| //  Copyright (c) 2017 Couchbase, Inc.
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| // 		http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package utf8
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"unicode/utf8"
 | |
| )
 | |
| 
 | |
| // Sequences is a collection of Sequence
 | |
| type Sequences []Sequence
 | |
| 
 | |
| // NewSequences constructs a collection of Sequence which describe the
 | |
| // byte ranges covered between the start and end runes.
 | |
| func NewSequences(start, end rune) (Sequences, error) {
 | |
| 	rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil)
 | |
| 	return rv, err
 | |
| }
 | |
| 
 | |
| func NewSequencesPrealloc(start, end rune,
 | |
| 	preallocSequences Sequences,
 | |
| 	preallocRangeStack RangeStack,
 | |
| 	preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) {
 | |
| 	rv := preallocSequences[:0]
 | |
| 
 | |
| 	startBytes := preallocStartBytes
 | |
| 	if cap(startBytes) < utf8.UTFMax {
 | |
| 		startBytes = make([]byte, utf8.UTFMax)
 | |
| 	}
 | |
| 	startBytes = startBytes[:utf8.UTFMax]
 | |
| 
 | |
| 	endBytes := preallocEndBytes
 | |
| 	if cap(endBytes) < utf8.UTFMax {
 | |
| 		endBytes = make([]byte, utf8.UTFMax)
 | |
| 	}
 | |
| 	endBytes = endBytes[:utf8.UTFMax]
 | |
| 
 | |
| 	rangeStack := preallocRangeStack[:0]
 | |
| 	rangeStack = rangeStack.Push(scalarRange{start, end})
 | |
| 
 | |
| 	rangeStack, r := rangeStack.Pop()
 | |
| TOP:
 | |
| 	for r != nilScalarRange {
 | |
| 	INNER:
 | |
| 		for {
 | |
| 			r1, r2 := r.split()
 | |
| 			if r1 != nilScalarRange {
 | |
| 				rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end})
 | |
| 				r.start = r1.start
 | |
| 				r.end = r1.end
 | |
| 				continue INNER
 | |
| 			}
 | |
| 			if !r.valid() {
 | |
| 				rangeStack, r = rangeStack.Pop()
 | |
| 				continue TOP
 | |
| 			}
 | |
| 			for i := 1; i < utf8.UTFMax; i++ {
 | |
| 				max := maxScalarValue(i)
 | |
| 				if r.start <= max && max < r.end {
 | |
| 					rangeStack = rangeStack.Push(scalarRange{max + 1, r.end})
 | |
| 					r.end = max
 | |
| 					continue INNER
 | |
| 				}
 | |
| 			}
 | |
| 			asciiRange := r.ascii()
 | |
| 			if asciiRange != nilRange {
 | |
| 				rv = append(rv, Sequence{
 | |
| 					asciiRange,
 | |
| 				})
 | |
| 				rangeStack, r = rangeStack.Pop()
 | |
| 				continue TOP
 | |
| 			}
 | |
| 			for i := uint(1); i < utf8.UTFMax; i++ {
 | |
| 				m := rune((1 << (6 * i)) - 1)
 | |
| 				if (r.start & ^m) != (r.end & ^m) {
 | |
| 					if (r.start & m) != 0 {
 | |
| 						rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end})
 | |
| 						r.end = r.start | m
 | |
| 						continue INNER
 | |
| 					}
 | |
| 					if (r.end & m) != m {
 | |
| 						rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end})
 | |
| 						r.end = (r.end & ^m) - 1
 | |
| 						continue INNER
 | |
| 					}
 | |
| 				}
 | |
| 			}
 | |
| 			n, m := r.encode(startBytes, endBytes)
 | |
| 			seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m])
 | |
| 			if err != nil {
 | |
| 				return nil, nil, err
 | |
| 			}
 | |
| 			rv = append(rv, seq)
 | |
| 			rangeStack, r = rangeStack.Pop()
 | |
| 			continue TOP
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return rv, rangeStack, nil
 | |
| }
 | |
| 
 | |
| // Sequence is a collection of Range
 | |
| type Sequence []Range
 | |
| 
 | |
| // SequenceFromEncodedRange creates sequence from the encoded bytes
 | |
| func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
 | |
| 	if len(start) != len(end) {
 | |
| 		return nil, fmt.Errorf("byte slices must be the same length")
 | |
| 	}
 | |
| 	switch len(start) {
 | |
| 	case 2:
 | |
| 		return Sequence{
 | |
| 			Range{start[0], end[0]},
 | |
| 			Range{start[1], end[1]},
 | |
| 		}, nil
 | |
| 	case 3:
 | |
| 		return Sequence{
 | |
| 			Range{start[0], end[0]},
 | |
| 			Range{start[1], end[1]},
 | |
| 			Range{start[2], end[2]},
 | |
| 		}, nil
 | |
| 	case 4:
 | |
| 		return Sequence{
 | |
| 			Range{start[0], end[0]},
 | |
| 			Range{start[1], end[1]},
 | |
| 			Range{start[2], end[2]},
 | |
| 			Range{start[3], end[3]},
 | |
| 		}, nil
 | |
| 	}
 | |
| 
 | |
| 	return nil, fmt.Errorf("invalid encoded byte length")
 | |
| }
 | |
| 
 | |
| // Matches checks to see if the provided byte slice matches the Sequence
 | |
| func (u Sequence) Matches(bytes []byte) bool {
 | |
| 	if len(bytes) < len(u) {
 | |
| 		return false
 | |
| 	}
 | |
| 	for i := 0; i < len(u); i++ {
 | |
| 		if !u[i].matches(bytes[i]) {
 | |
| 			return false
 | |
| 		}
 | |
| 	}
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| func (u Sequence) String() string {
 | |
| 	switch len(u) {
 | |
| 	case 1:
 | |
| 		return fmt.Sprintf("%v", u[0])
 | |
| 	case 2:
 | |
| 		return fmt.Sprintf("%v%v", u[0], u[1])
 | |
| 	case 3:
 | |
| 		return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
 | |
| 	case 4:
 | |
| 		return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
 | |
| 	default:
 | |
| 		return fmt.Sprintf("invalid utf8 sequence")
 | |
| 	}
 | |
| }
 | |
| 
 | |
// Range describes a single inclusive range of byte values, from Start
// through End.
type Range struct {
	Start byte
	End   byte
}

// nilRange is the sentinel meaning "no range". Its Start is greater than
// its End, so matches never succeeds for it.
var nilRange = Range{0xff, 0}

// matches reports whether b lies within the inclusive bounds of the Range.
func (u Range) matches(b byte) bool {
	return u.Start <= b && b <= u.End
}
 | |
| 
 | |
| func (u Range) String() string {
 | |
| 	if u.Start == u.End {
 | |
| 		return fmt.Sprintf("[%X]", u.Start)
 | |
| 	}
 | |
| 	return fmt.Sprintf("[%X-%X]", u.Start, u.End)
 | |
| }
 | |
| 
 | |
// scalarRange is a range of runes running from start through end.
type scalarRange struct {
	start, end rune
}

// nilScalarRange is the sentinel meaning "no range"; its start exceeds its
// end.
var nilScalarRange = scalarRange{start: 0xffff, end: 0}

// String formats the range as "ScalarRange(start,end)" using decimal rune
// values.
func (s *scalarRange) String() string {
	lo, hi := s.start, s.end
	return fmt.Sprintf("ScalarRange(%d,%d)", lo, hi)
}
 | |
| 
 | |
| // split this scalar range if it overlaps with a surrogate codepoint
 | |
| func (s *scalarRange) split() (scalarRange, scalarRange) {
 | |
| 	if s.start < 0xe000 && s.end > 0xd7ff {
 | |
| 		return scalarRange{
 | |
| 				start: s.start,
 | |
| 				end:   0xd7ff,
 | |
| 			},
 | |
| 			scalarRange{
 | |
| 				start: 0xe000,
 | |
| 				end:   s.end,
 | |
| 			}
 | |
| 	}
 | |
| 	return nilScalarRange, nilScalarRange
 | |
| }
 | |
| 
 | |
| func (s *scalarRange) valid() bool {
 | |
| 	return s.start <= s.end
 | |
| }
 | |
| 
 | |
| func (s *scalarRange) ascii() Range {
 | |
| 	if s.valid() && s.end <= 0x7f {
 | |
| 		return Range{
 | |
| 			Start: byte(s.start),
 | |
| 			End:   byte(s.end),
 | |
| 		}
 | |
| 	}
 | |
| 	return nilRange
 | |
| }
 | |
| 
 | |
| // start and end MUST have capacity for utf8.UTFMax bytes
 | |
| func (s *scalarRange) encode(start, end []byte) (int, int) {
 | |
| 	n := utf8.EncodeRune(start, s.start)
 | |
| 	m := utf8.EncodeRune(end, s.end)
 | |
| 	return n, m
 | |
| }
 | |
| 
 | |
| type RangeStack []scalarRange
 | |
| 
 | |
| func (s RangeStack) Push(v scalarRange) RangeStack {
 | |
| 	return append(s, v)
 | |
| }
 | |
| 
 | |
| func (s RangeStack) Pop() (RangeStack, scalarRange) {
 | |
| 	l := len(s)
 | |
| 	if l < 1 {
 | |
| 		return s, nilScalarRange
 | |
| 	}
 | |
| 	return s[:l-1], s[l-1]
 | |
| }
 | |
| 
 | |
// maxScalarValue returns the largest rune whose UTF-8 encoding occupies
// nbytes bytes. Any nbytes other than 1, 2 or 3 yields 0x10FFFF.
func maxScalarValue(nbytes int) rune {
	if nbytes == 1 {
		return 0x007f
	}
	if nbytes == 2 {
		return 0x07FF
	}
	if nbytes == 3 {
		return 0xFFFF
	}
	return 0x10FFFF
}
 |