githaven-fork/modules/markup/csv/csv.go
Norwin 867f46f78e Detect delimiter in CSV rendering (#7869)
* detect csv delimiter in csv rendering

fixes #7868

* make linter happy

* fix failing testcase & use ints where possible

* expose markup type to template

previously all markup had the .markdown class, which is incorrect,
as it applies markdown CSS & JS logic to CSV rendering

* fix build (missing `make css`)

* ignore quoted csv content for delimiter scoring

also fix html generation
2019-08-16 01:09:50 +03:00

114 lines
2.6 KiB
Go

// Copyright 2018 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package markup
import (
"bytes"
"encoding/csv"
"html"
"io"
"regexp"
"strings"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/util"
)
var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
func init() {
markup.RegisterParser(Parser{})
}
// Parser implements markup.Parser for orgmode
type Parser struct {
}
// Name implements markup.Parser
func (Parser) Name() string {
return "csv"
}
// Extensions implements markup.Parser
func (Parser) Extensions() []string {
return []string{".csv", ".tsv"}
}
// Render implements markup.Parser
func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
rd := csv.NewReader(bytes.NewReader(rawBytes))
rd.Comma = p.bestDelimiter(rawBytes)
var tmpBlock bytes.Buffer
tmpBlock.WriteString(`<table class="table">`)
for {
fields, err := rd.Read()
if err == io.EOF {
break
}
if err != nil {
continue
}
tmpBlock.WriteString("<tr>")
for _, field := range fields {
tmpBlock.WriteString("<td>")
tmpBlock.WriteString(html.EscapeString(field))
tmpBlock.WriteString("</td>")
}
tmpBlock.WriteString("</tr>")
}
tmpBlock.WriteString("</table>")
return tmpBlock.Bytes()
}
// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
// Reads at most 10k bytes & 10 lines.
func (p Parser) bestDelimiter(data []byte) rune {
maxLines := 10
maxBytes := util.Min(len(data), 1e4)
text := string(data[:maxBytes])
text = quoteRegexp.ReplaceAllLiteralString(text, "")
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]
delimiters := []rune{',', ';', '\t', '|'}
bestDelim := delimiters[0]
bestScore := 0.0
for _, delim := range delimiters {
score := p.scoreDelimiter(lines, delim)
if score > bestScore {
bestScore = score
bestDelim = delim
}
}
return bestDelim
}
// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
countTotal := 0
countLineMax := 0
linesNotEqual := 0
for _, line := range lines {
if len(line) == 0 {
continue
}
countLine := strings.Count(line, string(delim))
countTotal += countLine
if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = util.Max(countLine, countLineMax)
}
}
return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}