forked from Shiloh/githaven
286 lines
8.9 KiB
Plaintext
286 lines
8.9 KiB
Plaintext
|
//  Copyright (c) 2015 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.
|
|||
|
|
|||
|
// +build BUILDTAGS
|
|||
|
|
|||
|
package segment
|
|||
|
|
|||
|
import (
|
|||
|
"fmt"
|
|||
|
"unicode/utf8"
|
|||
|
)
|
|||
|
|
|||
|
// RagelFlags holds the literal placeholder "RAGELFLAGS".
// NOTE(review): presumably the generation step substitutes this token with the
// flags that were passed to ragel, the same way BUILDTAGS is a placeholder in
// the build constraint above; confirm against the build script.
var RagelFlags = "RAGELFLAGS"

// ParseError is returned by segmentWords when the state machine finishes in a
// state below s_first_final, i.e. the input could not be fully segmented.
var ParseError = fmt.Errorf("unicode word segmentation parse error")
|
|||
|
|
|||
|
// Word types reported by segmentWords, one entry per returned segment.
const (
	// None is attached to segments produced by finishNoneToken: regional
	// indicator runs, CR/LF/newline sequences, Extend/Format runs and any
	// other single rune.
	None = iota
	// Number is attached to segments matched by the WordNumeric rule.
	Number
	// Letter is attached to segments matched by the WordHangul and Word rules.
	Letter
	// Kana is part of the exported set but is not assigned by any action in
	// segmentWords (Katakana and Hiragana segments are reported as Ideo).
	Kana
	// Ideo is attached to segments matched by the WordKatakana, WordHan and
	// WordHiragana rules.
	Ideo
)
|
|||
|
|
|||
|
%%{
	# Name the state machine "s" and emit its static tables and constants;
	# segmentWords relies on the generated s_first_final.
	machine s;
	write data;
}%%
|
|||
|
|
|||
|
// segmentWords runs the Ragel scanner defined below over data and splits it
// into word segments following the UAX#29 rules quoted in the grammar.
//
// Parameters:
//   - data: the bytes to scan. Returned segments are sub-slices of data, not
//     copies.
//   - maxTokens: when > 0, scanning stops as soon as len(val) reaches it
//     (entries already present in val count towards the limit). Values <= 0
//     impose no limit.
//   - atEOF: whether data ends the input. When false, numeric and general word
//     segments are never emitted; the other kinds are withheld when the
//     segment ends exactly at the end of data.
//   - val, types: optional slices to append to. A nil slice is allocated with
//     capacity maxTokens, or 1000 when maxTokens is negative.
//
// Independently of atEOF, the Hangul, Katakana, Han, Hiragana and None actions
// also withhold a segment when the bytes following it do not decode as a valid
// rune (utf8.DecodeRune returns RuneError with size 1).
//
// Returns the segments, their word types (None, Number, Letter, Ideo), the
// number of bytes of data covered by the emitted segments, and ParseError if
// the machine ends below s_first_final. Whenever an action withholds a segment
// or the token limit is hit, the function returns immediately with a nil error.
func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
	cs, p, pe := 0, 0, len(data)

	// Capacity for result slices we have to allocate ourselves. Named
	// capacity rather than cap so the builtin is not shadowed.
	capacity := maxTokens
	if capacity < 0 {
		capacity = 1000
	}
	if val == nil {
		val = make([][]byte, 0, capacity)
	}
	if types == nil {
		types = make([]int, 0, capacity)
	}

	// Variables required by Ragel scanner (|* ... *|) machines.
	ts := 0
	te := 0
	act := 0
	eof := pe
	_ = ts // keep the Go compiler from reporting these as unused
	_ = te
	_ = act

	// Our own state: bounds of the token being matched (endPos is the index
	// of its last byte) and how many bytes have been emitted so far.
	startPos := 0
	endPos := 0
	totalConsumed := 0
	%%{

	include SCRIPTS "ragel/uscript.rl";
	include WB "ragel/uwb.rl";

	action startToken {
		startPos = p
	}

	action endToken {
		endPos = p
	}

	action finishNumericToken {
		// Numeric segments are only emitted once the caller signals end of input.
		if !atEOF {
			return val, types, totalConsumed, nil
		}

		val = append(val, data[startPos:endPos+1])
		types = append(types, Number)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishHangulToken {
		// Withhold the segment if it touches the end of a partial buffer, or
		// if the following bytes are not a valid rune (RuneError with size 1).
		if endPos+1 == pe && !atEOF {
			return val, types, totalConsumed, nil
		} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
			return val, types, totalConsumed, nil
		}

		val = append(val, data[startPos:endPos+1])
		types = append(types, Letter)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishKatakanaToken {
		// Same withholding rules as finishHangulToken.
		if endPos+1 == pe && !atEOF {
			return val, types, totalConsumed, nil
		} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
			return val, types, totalConsumed, nil
		}

		val = append(val, data[startPos:endPos+1])
		types = append(types, Ideo)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishWordToken {
		// General word segments are only emitted once the caller signals end of input.
		if !atEOF {
			return val, types, totalConsumed, nil
		}
		val = append(val, data[startPos:endPos+1])
		types = append(types, Letter)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishHanToken {
		// Same withholding rules as finishHangulToken.
		if endPos+1 == pe && !atEOF {
			return val, types, totalConsumed, nil
		} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
			return val, types, totalConsumed, nil
		}

		val = append(val, data[startPos:endPos+1])
		types = append(types, Ideo)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishHiraganaToken {
		// Same withholding rules as finishHangulToken.
		if endPos+1 == pe && !atEOF {
			return val, types, totalConsumed, nil
		} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
			return val, types, totalConsumed, nil
		}

		val = append(val, data[startPos:endPos+1])
		types = append(types, Ideo)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	action finishNoneToken {
		// Walk whole runes from startPos until endPos is passed, so that
		// endPos lands on the final byte of a rune, then move the scanner
		// position p to that byte.
		lastPos := startPos
		for lastPos <= endPos {
			_, size := utf8.DecodeRune(data[lastPos:])
			lastPos += size
		}
		endPos = lastPos -1
		p = endPos

		if endPos+1 == pe && !atEOF {
			return val, types, totalConsumed, nil
		} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
			return val, types, totalConsumed, nil
		}
		// otherwise, consume this as well
		val = append(val, data[startPos:endPos+1])
		types = append(types, None)
		totalConsumed = endPos+1
		if maxTokens > 0 && len(val) >= maxTokens {
			return val, types, totalConsumed, nil
		}
	}

	HangulEx = Hangul ( Extend | Format )*;
	HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
	NumericEx = Numeric ( Extend | Format )*;
	KatakanaEx = Katakana ( Extend | Format )*;
	MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
	MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
	ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
	HanEx = Han ( Extend | Format )*;
	HiraganaEx = Hiragana ( Extend | Format )*;
	SingleQuoteEx = Single_Quote ( Extend | Format )*;
	DoubleQuoteEx = Double_Quote ( Extend | Format )*;
	HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
	RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
	NLCRLF = Newline | CR | LF;
	OtherEx = ^(NLCRLF) ( Extend | Format )* ;

	# UAX#29 WB8. Numeric × Numeric
	# WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
	# WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
	# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
	# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
	#
	WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;

	# subset of the below for typing purposes only!
	WordHangul = ( HangulEx )+ >startToken @endToken;
	WordKatakana = ( KatakanaEx )+ >startToken @endToken;

	# UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
	# WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
	# WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
	# WB7a. Hebrew_Letter × Single_Quote
	# WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
	# WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
	# WB9. (ALetter | Hebrew_Letter) × Numeric
	# WB10. Numeric × (ALetter | Hebrew_Letter)
	# WB13. Katakana × Katakana
	# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
	# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
	#
	# Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
	#
	Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
	                           | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
	                             | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
	                             | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
	                             |ExtendNumLetEx
	                             )+
	                           )
	         (
	          ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
	                              | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
	                                | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
	                                | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
	                                )+
	                              )
	         )* ExtendNumLetEx*) >startToken @endToken;

	# UAX#29 WB14. Any ÷ Any
	WordHan = HanEx >startToken @endToken;
	WordHiragana = HiraganaEx >startToken @endToken;

	WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star

	WordCRLF = (CR LF) >startToken @endToken;

	WordCR = CR >startToken @endToken;

	WordLF = LF >startToken @endToken;

	WordNL = Newline >startToken @endToken;

	WordRegional = (RegionalIndicatorEx+) >startToken @endToken;

	Other = OtherEx >startToken @endToken;

	main := |*
		WordNumeric => finishNumericToken;
		WordHangul => finishHangulToken;
		WordKatakana => finishKatakanaToken;
		Word => finishWordToken;
		WordHan => finishHanToken;
		WordHiragana => finishHiraganaToken;
		WordRegional =>finishNoneToken;
		WordCRLF => finishNoneToken;
		WordCR => finishNoneToken;
		WordLF => finishNoneToken;
		WordNL => finishNoneToken;
		WordExt => finishNoneToken;
		Other => finishNoneToken;
	*|;

	write init;
	write exec;
	}%%

	// A final state below s_first_final means the machine did not stop in an
	// accepting state.
	if cs < s_first_final {
		return val, types, totalConsumed, ParseError
	}

	return val, types, totalConsumed, nil
}
|