githaven-fork/vendor/github.com/jaytaylor/html2text/html2text.go

536 lines
13 KiB
Go
Raw Normal View History

2016-11-03 22:16:01 +00:00
package html2text
import (
"bytes"
"io"
"regexp"
"strings"
"unicode"
"github.com/olekukonko/tablewriter"
"github.com/ssor/bom"
2016-11-03 22:16:01 +00:00
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Options provide toggles and overrides to control specific rendering behaviors.
type Options struct {
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
}
// PrettyTablesOptions overrides tablewriter behaviors
type PrettyTablesOptions struct {
AutoFormatHeader bool
AutoWrapText bool
ReflowDuringAutoWrap bool
ColWidth int
ColumnSeparator string
RowSeparator string
CenterSeparator string
HeaderAlignment int
FooterAlignment int
Alignment int
ColumnAlignment []int
NewLine string
HeaderLine bool
RowLine bool
AutoMergeCells bool
Borders tablewriter.Border
}
// NewPrettyTablesOptions creates PrettyTablesOptions with default settings
func NewPrettyTablesOptions() *PrettyTablesOptions {
return &PrettyTablesOptions{
AutoFormatHeader: true,
AutoWrapText: true,
ReflowDuringAutoWrap: true,
ColWidth: tablewriter.MAX_ROW_WIDTH,
ColumnSeparator: tablewriter.COLUMN,
RowSeparator: tablewriter.ROW,
CenterSeparator: tablewriter.CENTER,
HeaderAlignment: tablewriter.ALIGN_DEFAULT,
FooterAlignment: tablewriter.ALIGN_DEFAULT,
Alignment: tablewriter.ALIGN_DEFAULT,
ColumnAlignment: []int{},
NewLine: tablewriter.NEWLINE,
HeaderLine: true,
RowLine: false,
AutoMergeCells: false,
Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true},
}
}
// FromHTMLNode renders text output from a pre-parsed HTML document.
func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
var options Options
if len(o) > 0 {
options = o[0]
}
ctx := textifyTraverseContext{
buf: bytes.Buffer{},
options: options,
}
if err := ctx.traverse(doc); err != nil {
return "", err
}
text := strings.TrimSpace(newlineRe.ReplaceAllString(
strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
)
return text, nil
}
// FromReader renders text output after parsing HTML for the specified
// io.Reader.
func FromReader(reader io.Reader, options ...Options) (string, error) {
newReader, err := bom.NewReaderWithoutBom(reader)
if err != nil {
return "", err
}
doc, err := html.Parse(newReader)
if err != nil {
return "", err
}
return FromHTMLNode(doc, options...)
}
// FromString parses HTML from the input string, then renders the text form.
func FromString(input string, options ...Options) (string, error) {
bs := bom.CleanBom([]byte(input))
text, err := FromReader(bytes.NewReader(bs), options...)
if err != nil {
return "", err
}
return text, nil
}
2016-11-03 22:16:01 +00:00
var (
spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
newlineRe = regexp.MustCompile(`\n\n+`)
)
// traverseTableCtx holds text-related context.
type textifyTraverseContext struct {
buf bytes.Buffer
2016-11-03 22:16:01 +00:00
prefix string
tableCtx tableTraverseContext
options Options
2016-11-03 22:16:01 +00:00
endsWithSpace bool
justClosedDiv bool
blockquoteLevel int
lineLength int
isPre bool
2016-11-03 22:16:01 +00:00
}
// tableTraverseContext holds table ASCII-form related context.
type tableTraverseContext struct {
header []string
body [][]string
footer []string
tmpRow int
isInFooter bool
}
2016-11-03 22:16:01 +00:00
func (tableCtx *tableTraverseContext) init() {
tableCtx.body = [][]string{}
tableCtx.header = []string{}
tableCtx.footer = []string{}
tableCtx.isInFooter = false
tableCtx.tmpRow = 0
}
2016-11-03 22:16:01 +00:00
func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
ctx.justClosedDiv = false
2016-11-03 22:16:01 +00:00
switch node.DataAtom {
case atom.Br:
return ctx.emit("\n")
2016-11-03 22:16:01 +00:00
case atom.H1, atom.H2, atom.H3:
subCtx := textifyTraverseContext{}
if err := subCtx.traverseChildren(node); err != nil {
return err
}
2016-11-03 22:16:01 +00:00
str := subCtx.buf.String()
dividerLen := 0
for _, line := range strings.Split(str, "\n") {
if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
dividerLen = lineLen - 1
2016-11-03 22:16:01 +00:00
}
}
var divider string
if node.DataAtom == atom.H1 {
divider = strings.Repeat("*", dividerLen)
} else {
divider = strings.Repeat("-", dividerLen)
}
2016-11-03 22:16:01 +00:00
if node.DataAtom == atom.H3 {
return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
}
return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
2016-11-03 22:16:01 +00:00
case atom.Blockquote:
ctx.blockquoteLevel++
ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
if err := ctx.emit("\n"); err != nil {
return err
}
if ctx.blockquoteLevel == 1 {
2016-11-03 22:16:01 +00:00
if err := ctx.emit("\n"); err != nil {
return err
}
}
if err := ctx.traverseChildren(node); err != nil {
return err
}
ctx.blockquoteLevel--
ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
if ctx.blockquoteLevel > 0 {
ctx.prefix += " "
}
return ctx.emit("\n\n")
case atom.Div:
if ctx.lineLength > 0 {
if err := ctx.emit("\n"); err != nil {
2016-11-03 22:16:01 +00:00
return err
}
}
if err := ctx.traverseChildren(node); err != nil {
return err
}
var err error
if !ctx.justClosedDiv {
err = ctx.emit("\n")
}
ctx.justClosedDiv = true
return err
case atom.Li:
if err := ctx.emit("* "); err != nil {
return err
}
if err := ctx.traverseChildren(node); err != nil {
return err
}
return ctx.emit("\n")
case atom.B, atom.Strong:
subCtx := textifyTraverseContext{}
subCtx.endsWithSpace = true
if err := subCtx.traverseChildren(node); err != nil {
return err
}
str := subCtx.buf.String()
return ctx.emit("*" + str + "*")
case atom.A:
linkText := ""
// For simple link element content with single text node only, peek at the link text.
if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode {
linkText = node.FirstChild.Data
}
2016-11-03 22:16:01 +00:00
// If image is the only child, take its alt text as the link text.
if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
if altText := getAttrVal(img, "alt"); altText != "" {
if err := ctx.emit(altText); err != nil {
2016-11-03 22:16:01 +00:00
return err
}
}
} else if err := ctx.traverseChildren(node); err != nil {
2016-11-03 22:16:01 +00:00
return err
}
2016-11-03 22:16:01 +00:00
hrefLink := ""
if attrVal := getAttrVal(node, "href"); attrVal != "" {
attrVal = ctx.normalizeHrefLink(attrVal)
// Don't print link href if it matches link element content or if the link is empty.
if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
hrefLink = "( " + attrVal + " )"
2016-11-03 22:16:01 +00:00
}
}
2016-11-03 22:16:01 +00:00
return ctx.emit(hrefLink)
2016-11-03 22:16:01 +00:00
case atom.P, atom.Ul:
return ctx.paragraphHandler(node)
2016-11-03 22:16:01 +00:00
case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td:
if ctx.options.PrettyTables {
return ctx.handleTableElement(node)
} else if node.DataAtom == atom.Table {
return ctx.paragraphHandler(node)
}
return ctx.traverseChildren(node)
2016-11-03 22:16:01 +00:00
case atom.Pre:
ctx.isPre = true
err := ctx.traverseChildren(node)
ctx.isPre = false
return err
2016-11-03 22:16:01 +00:00
case atom.Style, atom.Script, atom.Head:
// Ignore the subtree.
return nil
2016-11-03 22:16:01 +00:00
default:
return ctx.traverseChildren(node)
}
}
2016-11-03 22:16:01 +00:00
// paragraphHandler renders node children surrounded by double newlines.
func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error {
if err := ctx.emit("\n\n"); err != nil {
return err
}
if err := ctx.traverseChildren(node); err != nil {
return err
}
return ctx.emit("\n\n")
}
2016-11-03 22:16:01 +00:00
// handleTableElement is only to be invoked when options.PrettyTables is active.
func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error {
if !ctx.options.PrettyTables {
panic("handleTableElement invoked when PrettyTables not active")
}
2016-11-03 22:16:01 +00:00
switch node.DataAtom {
case atom.Table:
if err := ctx.emit("\n\n"); err != nil {
return err
}
// Re-intialize all table context.
ctx.tableCtx.init()
// Browse children, enriching context with table data.
if err := ctx.traverseChildren(node); err != nil {
return err
}
buf := &bytes.Buffer{}
table := tablewriter.NewWriter(buf)
if ctx.options.PrettyTablesOptions != nil {
options := ctx.options.PrettyTablesOptions
table.SetAutoFormatHeaders(options.AutoFormatHeader)
table.SetAutoWrapText(options.AutoWrapText)
table.SetReflowDuringAutoWrap(options.ReflowDuringAutoWrap)
table.SetColWidth(options.ColWidth)
table.SetColumnSeparator(options.ColumnSeparator)
table.SetRowSeparator(options.RowSeparator)
table.SetCenterSeparator(options.CenterSeparator)
table.SetHeaderAlignment(options.HeaderAlignment)
table.SetFooterAlignment(options.FooterAlignment)
table.SetAlignment(options.Alignment)
table.SetColumnAlignment(options.ColumnAlignment)
table.SetNewLine(options.NewLine)
table.SetHeaderLine(options.HeaderLine)
table.SetRowLine(options.RowLine)
table.SetAutoMergeCells(options.AutoMergeCells)
table.SetBorders(options.Borders)
}
table.SetHeader(ctx.tableCtx.header)
table.SetFooter(ctx.tableCtx.footer)
table.AppendBulk(ctx.tableCtx.body)
// Render the table using ASCII.
table.Render()
if err := ctx.emit(buf.String()); err != nil {
return err
}
2016-11-03 22:16:01 +00:00
return ctx.emit("\n\n")
2016-11-03 22:16:01 +00:00
case atom.Tfoot:
ctx.tableCtx.isInFooter = true
if err := ctx.traverseChildren(node); err != nil {
return err
}
ctx.tableCtx.isInFooter = false
case atom.Tr:
ctx.tableCtx.body = append(ctx.tableCtx.body, []string{})
if err := ctx.traverseChildren(node); err != nil {
return err
}
ctx.tableCtx.tmpRow++
2016-11-03 22:16:01 +00:00
case atom.Th:
res, err := ctx.renderEachChild(node)
if err != nil {
return err
2016-11-03 22:16:01 +00:00
}
ctx.tableCtx.header = append(ctx.tableCtx.header, res)
case atom.Td:
res, err := ctx.renderEachChild(node)
if err != nil {
return err
}
if ctx.tableCtx.isInFooter {
ctx.tableCtx.footer = append(ctx.tableCtx.footer, res)
} else {
ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res)
}
}
return nil
}
func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
switch node.Type {
default:
return ctx.traverseChildren(node)
case html.TextNode:
var data string
if ctx.isPre {
data = node.Data
} else {
data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
}
return ctx.emit(data)
case html.ElementNode:
return ctx.handleElement(node)
2016-11-03 22:16:01 +00:00
}
}
func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
2016-11-03 22:16:01 +00:00
for c := node.FirstChild; c != nil; c = c.NextSibling {
if err := ctx.traverse(c); err != nil {
return err
}
}
return nil
}
func (ctx *textifyTraverseContext) emit(data string) error {
if data == "" {
2016-11-03 22:16:01 +00:00
return nil
}
var (
lines = ctx.breakLongLines(data)
err error
)
2016-11-03 22:16:01 +00:00
for _, line := range lines {
runes := []rune(line)
startsWithSpace := unicode.IsSpace(runes[0])
if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
if err = ctx.buf.WriteByte(' '); err != nil {
return err
}
2016-11-03 22:16:01 +00:00
ctx.lineLength++
}
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
for _, c := range line {
if _, err = ctx.buf.WriteString(string(c)); err != nil {
2016-11-03 22:16:01 +00:00
return err
}
ctx.lineLength++
if c == '\n' {
ctx.lineLength = 0
if ctx.prefix != "" {
if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
2016-11-03 22:16:01 +00:00
return err
}
}
}
}
}
return nil
}
const maxLineLen = 74
func (ctx *textifyTraverseContext) breakLongLines(data string) []string {
// Only break lines when in blockquotes.
2016-11-03 22:16:01 +00:00
if ctx.blockquoteLevel == 0 {
return []string{data}
}
var (
ret = []string{}
runes = []rune(data)
l = len(runes)
existing = ctx.lineLength
)
if existing >= maxLineLen {
2016-11-03 22:16:01 +00:00
ret = append(ret, "\n")
existing = 0
}
for l+existing > maxLineLen {
i := maxLineLen - existing
2016-11-03 22:16:01 +00:00
for i >= 0 && !unicode.IsSpace(runes[i]) {
i--
}
if i == -1 {
// No spaces, so go the other way.
i = maxLineLen - existing
2016-11-03 22:16:01 +00:00
for i < l && !unicode.IsSpace(runes[i]) {
i++
}
}
ret = append(ret, string(runes[:i])+"\n")
for i < l && unicode.IsSpace(runes[i]) {
i++
}
runes = runes[i:]
l = len(runes)
existing = 0
}
if len(runes) > 0 {
ret = append(ret, string(runes))
}
return ret
}
func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
2016-11-03 22:16:01 +00:00
link = strings.TrimSpace(link)
link = strings.TrimPrefix(link, "mailto:")
return link
}
// renderEachChild visits each direct child of a node and collects the sequence of
// textuual representaitons separated by a single newline.
func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {
buf := &bytes.Buffer{}
for c := node.FirstChild; c != nil; c = c.NextSibling {
s, err := FromHTMLNode(c, ctx.options)
if err != nil {
return "", err
}
if _, err = buf.WriteString(s); err != nil {
return "", err
}
if c.NextSibling != nil {
if err = buf.WriteByte('\n'); err != nil {
return "", err
}
}
}
return buf.String(), nil
}
2016-11-03 22:16:01 +00:00
func getAttrVal(node *html.Node, attrName string) string {
for _, attr := range node.Attr {
if attr.Key == attrName {
return attr.Val
}
}
return ""
}