githaven/vendor/github.com/blevesearch/segment/segment_words.rl

286 lines
8.9 KiB
Plaintext
Raw Normal View History

// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build BUILDTAGS
package segment
import (
"fmt"
"unicode/utf8"
)
var (
	// RagelFlags holds the literal string "RAGELFLAGS" in this source.
	// NOTE(review): this looks like a placeholder that the generation step
	// substitutes with the flags passed to ragel - confirm against the
	// build script.
	RagelFlags = "RAGELFLAGS"

	// ParseError is the error segmentWords returns when the scanner stops
	// in a state below s_first_final.
	ParseError = fmt.Errorf("unicode word segmentation parse error")
)
// Word types reported alongside each token produced by segmentWords.
// The numeric values are part of the package's exported surface.
const (
	// None is appended for tokens finished by finishNoneToken
	// (line breaks, regional indicators, extend/format runs and "other").
	None = 0
	// Number is appended for tokens finished by finishNumericToken.
	Number = 1
	// Letter is appended for tokens finished by finishWordToken and
	// finishHangulToken.
	Letter = 2
	// Kana is declared but not appended by any action in this file.
	Kana = 3
	// Ideo is appended for tokens finished by finishHanToken,
	// finishHiraganaToken and finishKatakanaToken.
	Ideo = 4
)
// Ragel section: names the state machine "s" and asks Ragel to write the
// machine's static data at this point in the generated Go file. The
// s_first_final identifier used at the end of segmentWords comes from this
// generated data.
%%{
machine s;
write data;
}%%
// segmentWords scans data with the Ragel scanner defined in its body and
// appends each recognized token to val and the token's word type to types.
//
// Parameters:
//   - data: the bytes to scan. Tokens appended to val are sub-slices of data,
//     not copies.
//   - maxTokens: when > 0, the function returns as soon as len(val) reaches
//     this count (entries already present in val are counted). When < 0,
//     newly allocated result slices get a capacity of 1000; otherwise
//     maxTokens itself is used as the capacity.
//   - atEOF: when false, numeric and word tokens are never emitted, and the
//     other token kinds are not emitted if they end exactly at the end of
//     data; in both cases the function returns what it has collected so far.
//     NOTE(review): presumably this lets a caller retry with more input so a
//     token is not cut short at a chunk boundary - confirm against callers.
//   - val, types: slices to append to; each is allocated when nil.
//
// Returns the (possibly grown) val and types slices, the offset in data just
// past the last emitted token, and ParseError when the scanner ends in a
// state below s_first_final (nil otherwise).
func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
// cs is the current scanner state, p the current position and pe the end of
// the input; these are the variable names the Ragel-generated code uses.
cs, p, pe := 0, 0, len(data)
// NOTE: this local shadows the builtin cap for the rest of the function.
cap := maxTokens
if cap < 0 {
cap = 1000
}
if val == nil {
val = make([][]byte, 0, cap)
}
if types == nil {
types = make([]int, 0, cap)
}
// added for scanner
// ts, te and act are the token-start, token-end and last-action variables
// that a Ragel scanner (the |* ... *| construct below) expects to find in
// scope; eof marks the end-of-input position for the generated code.
ts := 0
te := 0
act := 0
eof := pe
_ = ts // compiler not happy
_ = te
_ = act
// our state
// startPos and endPos are the byte offsets of the first and last byte of the
// token currently being matched (endPos is inclusive). totalConsumed is the
// offset just past the last token that was actually emitted.
startPos := 0
endPos := 0
totalConsumed := 0
%%{
include SCRIPTS "ragel/uscript.rl";
include WB "ragel/uwb.rl";
action startToken {
// Entering action: remember where the candidate token begins.
startPos = p
}
action endToken {
// Finishing action: remember the last byte matched so far (inclusive).
endPos = p
}
action finishNumericToken {
// Numeric tokens are emitted only when atEOF is true; otherwise return the
// tokens collected so far without consuming this one.
if !atEOF {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
types = append(types, Number)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishHangulToken {
// Return without emitting when the token ends at the end of data and atEOF
// is false, or when the bytes right after the token are not a valid UTF-8
// encoding (DecodeRune reports RuneError with size 1 in that case).
if endPos+1 == pe && !atEOF {
return val, types, totalConsumed, nil
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
types = append(types, Letter)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishKatakanaToken {
// Same hold-back rule as the other script-specific tokens: do not emit when
// the token touches the end of data with atEOF false, or when invalid UTF-8
// follows it.
if endPos+1 == pe && !atEOF {
return val, types, totalConsumed, nil
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
// NOTE(review): Katakana runs are typed Ideo here, not Kana - confirm this
// is intended.
types = append(types, Ideo)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishWordToken {
// Word tokens are emitted only when atEOF is true; otherwise return the
// tokens collected so far without consuming this one.
if !atEOF {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
types = append(types, Letter)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishHanToken {
// Do not emit when the token touches the end of data with atEOF false, or
// when invalid UTF-8 follows it.
if endPos+1 == pe && !atEOF {
return val, types, totalConsumed, nil
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
types = append(types, Ideo)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishHiraganaToken {
// Do not emit when the token touches the end of data with atEOF false, or
// when invalid UTF-8 follows it.
if endPos+1 == pe && !atEOF {
return val, types, totalConsumed, nil
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
return val, types, totalConsumed, nil
}
val = append(val, data[startPos:endPos+1])
// NOTE(review): Hiragana is typed Ideo here, not Kana - confirm this is
// intended.
types = append(types, Ideo)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
action finishNoneToken {
// Walk forward rune by rune from startPos until the position passes endPos,
// so that endPos lands on the final byte of a whole rune and not in the
// middle of one.
lastPos := startPos
for lastPos <= endPos {
_, size := utf8.DecodeRune(data[lastPos:])
lastPos += size
}
endPos = lastPos -1
// Move the scanner position to the adjusted end of the token.
p = endPos
// Do not emit when the token touches the end of data with atEOF false, or
// when invalid UTF-8 follows it.
if endPos+1 == pe && !atEOF {
return val, types, totalConsumed, nil
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
return val, types, totalConsumed, nil
}
// otherwise, consume this as well
val = append(val, data[startPos:endPos+1])
types = append(types, None)
totalConsumed = endPos+1
// Stop early once the requested number of tokens has been collected.
if maxTokens > 0 && len(val) >= maxTokens {
return val, types, totalConsumed, nil
}
}
HangulEx = Hangul ( Extend | Format )*;
HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
NumericEx = Numeric ( Extend | Format )*;
KatakanaEx = Katakana ( Extend | Format )*;
MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
HanEx = Han ( Extend | Format )*;
HiraganaEx = Hiragana ( Extend | Format )*;
SingleQuoteEx = Single_Quote ( Extend | Format )*;
DoubleQuoteEx = Double_Quote ( Extend | Format )*;
HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
NLCRLF = Newline | CR | LF;
OtherEx = ^(NLCRLF) ( Extend | Format )* ;
# UAX#29 WB8. Numeric × Numeric
# WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
# WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
#
WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
# subset of the below for typing purposes only!
WordHangul = ( HangulEx )+ >startToken @endToken;
WordKatakana = ( KatakanaEx )+ >startToken @endToken;
# UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
# WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
# WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
# WB7a. Hebrew_Letter × Single_Quote
# WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
# WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
# WB9. (ALetter | Hebrew_Letter) × Numeric
# WB10. Numeric × (ALetter | Hebrew_Letter)
# WB13. Katakana × Katakana
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
#
# Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
#
Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
|ExtendNumLetEx
)+
)
(
( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
)+
)
)* ExtendNumLetEx*) >startToken @endToken;
# UAX#29 WB14. Any ÷ Any
WordHan = HanEx >startToken @endToken;
WordHiragana = HiraganaEx >startToken @endToken;
WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
WordCRLF = (CR LF) >startToken @endToken;
WordCR = CR >startToken @endToken;
WordLF = LF >startToken @endToken;
WordNL = Newline >startToken @endToken;
WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
Other = OtherEx >startToken @endToken;
main := |*
WordNumeric => finishNumericToken;
WordHangul => finishHangulToken;
WordKatakana => finishKatakanaToken;
Word => finishWordToken;
WordHan => finishHanToken;
WordHiragana => finishHiraganaToken;
WordRegional =>finishNoneToken;
WordCRLF => finishNoneToken;
WordCR => finishNoneToken;
WordLF => finishNoneToken;
WordNL => finishNoneToken;
WordExt => finishNoneToken;
Other => finishNoneToken;
*|;
write init;
write exec;
}%%
// The generated scanner has finished; a final state below s_first_final is
// reported to the caller as ParseError along with whatever was collected.
if cs < s_first_final {
return val, types, totalConsumed, ParseError
}
return val, types, totalConsumed, nil
}