githaven-fork/vendor/github.com/blevesearch/segment/maketesttables.go

220 lines
5.4 KiB
Go
Raw Normal View History

2017-01-25 02:43:02 +00:00
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build ignore
package main
import (
"bufio"
"bytes"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"strconv"
"strings"
"unicode"
)
var url = flag.String("url",
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
"URL of Unicode database directory")
var verbose = flag.Bool("verbose",
false,
"write data to stdout as it is parsed")
var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")
var outputFile = flag.String("output",
"",
"output file for generated tables; default stdout")
var output *bufio.Writer
func main() {
flag.Parse()
setupOutput()
graphemeTests := make([]test, 0)
graphemeComments := make([]string, 0)
graphemeTests, graphemeComments = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests, graphemeComments)
wordTests := make([]test, 0)
wordComments := make([]string, 0)
wordTests, wordComments = loadUnicodeData("WordBreakTest.txt", wordTests, wordComments)
sentenceTests := make([]test, 0)
sentenceComments := make([]string, 0)
sentenceTests, sentenceComments = loadUnicodeData("SentenceBreakTest.txt", sentenceTests, sentenceComments)
fmt.Fprintf(output, fileHeader, *url)
generateTestTables("Grapheme", graphemeTests, graphemeComments)
generateTestTables("Word", wordTests, wordComments)
generateTestTables("Sentence", sentenceTests, sentenceComments)
flushOutput()
}
// WordBreakProperty.txt has the form:
// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
func openReader(file string) (input io.ReadCloser) {
if *localFiles {
f, err := os.Open(file)
if err != nil {
log.Fatal(err)
}
input = f
} else {
path := *url + file
resp, err := http.Get(path)
if err != nil {
log.Fatal(err)
}
if resp.StatusCode != 200 {
log.Fatal("bad GET status for "+file, resp.Status)
}
input = resp.Body
}
return
}
func loadUnicodeData(filename string, tests []test, comments []string) ([]test, []string) {
f := openReader(filename)
defer f.Close()
bufioReader := bufio.NewReader(f)
line, err := bufioReader.ReadString('\n')
for err == nil {
tests, comments = parseLine(line, tests, comments)
line, err = bufioReader.ReadString('\n')
}
// if the err was EOF still need to process last value
if err == io.EOF {
tests, comments = parseLine(line, tests, comments)
}
return tests, comments
}
const comment = "#"
const brk = "÷"
const nbrk = "×"
type test [][]byte
func parseLine(line string, tests []test, comments []string) ([]test, []string) {
if strings.HasPrefix(line, comment) {
return tests, comments
}
line = strings.TrimSpace(line)
if len(line) == 0 {
return tests, comments
}
commentStart := strings.Index(line, comment)
comment := strings.TrimSpace(line[commentStart+1:])
if commentStart > 0 {
line = line[0:commentStart]
}
pieces := strings.Split(line, brk)
t := make(test, 0)
for _, piece := range pieces {
piece = strings.TrimSpace(piece)
if len(piece) > 0 {
codePoints := strings.Split(piece, nbrk)
word := ""
for _, codePoint := range codePoints {
codePoint = strings.TrimSpace(codePoint)
r, err := strconv.ParseInt(codePoint, 16, 64)
if err != nil {
log.Printf("err: %v for '%s'", err, string(r))
return tests, comments
}
word += string(r)
}
t = append(t, []byte(word))
}
}
tests = append(tests, t)
comments = append(comments, comment)
return tests, comments
}
func generateTestTables(prefix string, tests []test, comments []string) {
fmt.Fprintf(output, testHeader, prefix)
for i, t := range tests {
fmt.Fprintf(output, "\t\t{\n")
fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
fmt.Fprintf(output, "\t\t\tcomment: `%s`,\n", comments[i])
fmt.Fprintf(output, "\t\t},\n")
}
fmt.Fprintf(output, "}\n")
}
func generateTest(t test) string {
rv := "[][]byte{"
for _, te := range t {
rv += fmt.Sprintf("%#v,", te)
}
rv += "}"
return rv
}
const fileHeader = `// Generated by running
// maketesttables --url=%s
// DO NOT EDIT
package segment
`
const testHeader = `var unicode%sTests = []struct {
input []byte
output [][]byte
comment string
}{
`
func setupOutput() {
output = bufio.NewWriter(startGofmt())
}
// startGofmt connects output to a gofmt process if -output is set.
func startGofmt() io.Writer {
if *outputFile == "" {
return os.Stdout
}
stdout, err := os.Create(*outputFile)
if err != nil {
log.Fatal(err)
}
// Pipe output to gofmt.
gofmt := exec.Command("gofmt")
fd, err := gofmt.StdinPipe()
if err != nil {
log.Fatal(err)
}
gofmt.Stdout = stdout
gofmt.Stderr = os.Stderr
err = gofmt.Start()
if err != nil {
log.Fatal(err)
}
return fd
}
func flushOutput() {
err := output.Flush()
if err != nil {
log.Fatal(err)
}
}