* upgrade to most recent bluemonday * make vendor * update tests for bluemonday * update tests for bluemonday * update tests for bluemonday
		
			
				
	
	
		
			357 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
			
		
		
	
	
			357 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
| // Copyright 2012 The Gorilla Authors. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style
 | |
| // license that can be found in the LICENSE file.
 | |
| 
 | |
| package scanner
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"regexp"
 | |
| 	"strings"
 | |
| 	"unicode"
 | |
| 	"unicode/utf8"
 | |
| )
 | |
| 
 | |
| // tokenType identifies the type of lexical tokens.
 | |
| type tokenType int
 | |
| 
 | |
| // String returns a string representation of the token type.
 | |
| func (t tokenType) String() string {
 | |
| 	return tokenNames[t]
 | |
| }
 | |
| 
 | |
| // Token represents a token and the corresponding string.
 | |
| type Token struct {
 | |
| 	Type   tokenType
 | |
| 	Value  string
 | |
| 	Line   int
 | |
| 	Column int
 | |
| }
 | |
| 
 | |
| // String returns a string representation of the token.
 | |
| func (t *Token) String() string {
 | |
| 	if len(t.Value) > 10 {
 | |
| 		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
 | |
| 			t.Type, t.Line, t.Column, t.Value)
 | |
| 	}
 | |
| 	return fmt.Sprintf("%s (line: %d, column: %d): %q",
 | |
| 		t.Type, t.Line, t.Column, t.Value)
 | |
| }
 | |
| 
 | |
| // All tokens -----------------------------------------------------------------
 | |
| 
 | |
| // The complete list of tokens in CSS3.
 | |
| const (
 | |
| 	// Scanner flags.
 | |
| 	TokenError tokenType = iota
 | |
| 	TokenEOF
 | |
| 	// From now on, only tokens from the CSS specification.
 | |
| 	TokenIdent
 | |
| 	TokenAtKeyword
 | |
| 	TokenString
 | |
| 	TokenHash
 | |
| 	TokenNumber
 | |
| 	TokenPercentage
 | |
| 	TokenDimension
 | |
| 	TokenURI
 | |
| 	TokenUnicodeRange
 | |
| 	TokenCDO
 | |
| 	TokenCDC
 | |
| 	TokenS
 | |
| 	TokenComment
 | |
| 	TokenFunction
 | |
| 	TokenIncludes
 | |
| 	TokenDashMatch
 | |
| 	TokenPrefixMatch
 | |
| 	TokenSuffixMatch
 | |
| 	TokenSubstringMatch
 | |
| 	TokenChar
 | |
| 	TokenBOM
 | |
| )
 | |
| 
 | |
| // tokenNames maps tokenType's to their names. Used for conversion to string.
 | |
| var tokenNames = map[tokenType]string{
 | |
| 	TokenError:          "error",
 | |
| 	TokenEOF:            "EOF",
 | |
| 	TokenIdent:          "IDENT",
 | |
| 	TokenAtKeyword:      "ATKEYWORD",
 | |
| 	TokenString:         "STRING",
 | |
| 	TokenHash:           "HASH",
 | |
| 	TokenNumber:         "NUMBER",
 | |
| 	TokenPercentage:     "PERCENTAGE",
 | |
| 	TokenDimension:      "DIMENSION",
 | |
| 	TokenURI:            "URI",
 | |
| 	TokenUnicodeRange:   "UNICODE-RANGE",
 | |
| 	TokenCDO:            "CDO",
 | |
| 	TokenCDC:            "CDC",
 | |
| 	TokenS:              "S",
 | |
| 	TokenComment:        "COMMENT",
 | |
| 	TokenFunction:       "FUNCTION",
 | |
| 	TokenIncludes:       "INCLUDES",
 | |
| 	TokenDashMatch:      "DASHMATCH",
 | |
| 	TokenPrefixMatch:    "PREFIXMATCH",
 | |
| 	TokenSuffixMatch:    "SUFFIXMATCH",
 | |
| 	TokenSubstringMatch: "SUBSTRINGMATCH",
 | |
| 	TokenChar:           "CHAR",
 | |
| 	TokenBOM:            "BOM",
 | |
| }
 | |
| 
 | |
| // Macros and productions -----------------------------------------------------
 | |
| // http://www.w3.org/TR/css3-syntax/#tokenization
 | |
| 
 | |
| var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)
 | |
| 
 | |
| // macros maps macro names to patterns to be expanded.
 | |
| var macros = map[string]string{
 | |
| 	// must be escaped: `\.+*?()|[]{}^$`
 | |
| 	"ident":      `-?{nmstart}{nmchar}*`,
 | |
| 	"name":       `{nmchar}+`,
 | |
| 	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
 | |
| 	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
 | |
| 	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
 | |
| 	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
 | |
| 	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
 | |
| 	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
 | |
| 	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
 | |
| 	"stringchar": `{urlchar}|[ ]|\\{nl}`,
 | |
| 	"nl":         `[\n\r\f]|\r\n`,
 | |
| 	"w":          `{wc}*`,
 | |
| 	"wc":         `[\t\n\f\r ]`,
 | |
| 
 | |
| 	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
 | |
| 	// ASCII characters range = `[\u0020-\u007e]`
 | |
| 	// Skip space \u0020 = `[\u0021-\u007e]`
 | |
| 	// Skip quotation mark \0022 = `[\u0021\u0023-\u007e]`
 | |
| 	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
 | |
| 	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d\u007e]`
 | |
| 	// Finally, the left square bracket (\u005b) and right (\u005d) needs escaping themselves
 | |
| 	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
 | |
| }
 | |
| 
 | |
| // productions maps the list of tokens to patterns to be expanded.
 | |
| var productions = map[tokenType]string{
 | |
| 	// Unused regexps (matched using other methods) are commented out.
 | |
| 	TokenIdent:        `{ident}`,
 | |
| 	TokenAtKeyword:    `@{ident}`,
 | |
| 	TokenString:       `{string}`,
 | |
| 	TokenHash:         `#{name}`,
 | |
| 	TokenNumber:       `{num}`,
 | |
| 	TokenPercentage:   `{num}%`,
 | |
| 	TokenDimension:    `{num}{ident}`,
 | |
| 	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
 | |
| 	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
 | |
| 	//TokenCDO:            `<!--`,
 | |
| 	TokenCDC:      `-->`,
 | |
| 	TokenS:        `{wc}+`,
 | |
| 	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
 | |
| 	TokenFunction: `{ident}\(`,
 | |
| 	//TokenIncludes:       `~=`,
 | |
| 	//TokenDashMatch:      `\|=`,
 | |
| 	//TokenPrefixMatch:    `\^=`,
 | |
| 	//TokenSuffixMatch:    `\$=`,
 | |
| 	//TokenSubstringMatch: `\*=`,
 | |
| 	//TokenChar:           `[^"']`,
 | |
| 	//TokenBOM:            "\uFEFF",
 | |
| }
 | |
| 
 | |
| // matchers maps the list of tokens to compiled regular expressions.
 | |
| //
 | |
| // The map is filled on init() using the macros and productions defined in
 | |
| // the CSS specification.
 | |
| var matchers = map[tokenType]*regexp.Regexp{}
 | |
| 
 | |
| // matchOrder is the order to test regexps when first-char shortcuts
 | |
| // can't be used.
 | |
| var matchOrder = []tokenType{
 | |
| 	TokenURI,
 | |
| 	TokenFunction,
 | |
| 	TokenUnicodeRange,
 | |
| 	TokenIdent,
 | |
| 	TokenDimension,
 | |
| 	TokenPercentage,
 | |
| 	TokenNumber,
 | |
| 	TokenCDC,
 | |
| }
 | |
| 
 | |
| func init() {
 | |
| 	// replace macros and compile regexps for productions.
 | |
| 	replaceMacro := func(s string) string {
 | |
| 		return "(?:" + macros[s[1:len(s)-1]] + ")"
 | |
| 	}
 | |
| 	for t, s := range productions {
 | |
| 		for macroRegexp.MatchString(s) {
 | |
| 			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
 | |
| 		}
 | |
| 		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Scanner --------------------------------------------------------------------
 | |
| 
 | |
| // New returns a new CSS scanner for the given input.
 | |
| func New(input string) *Scanner {
 | |
| 	// Normalize newlines.
 | |
| 	input = strings.Replace(input, "\r\n", "\n", -1)
 | |
| 	return &Scanner{
 | |
| 		input: input,
 | |
| 		row:   1,
 | |
| 		col:   1,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Scanner scans an input and emits tokens following the CSS3 specification.
 | |
| type Scanner struct {
 | |
| 	input string
 | |
| 	pos   int
 | |
| 	row   int
 | |
| 	col   int
 | |
| 	err   *Token
 | |
| }
 | |
| 
 | |
| // Next returns the next token from the input.
 | |
| //
 | |
| // At the end of the input the token type is TokenEOF.
 | |
| //
 | |
| // If the input can't be tokenized the token type is TokenError. This occurs
 | |
| // in case of unclosed quotation marks or comments.
 | |
| func (s *Scanner) Next() *Token {
 | |
| 	if s.err != nil {
 | |
| 		return s.err
 | |
| 	}
 | |
| 	if s.pos >= len(s.input) {
 | |
| 		s.err = &Token{TokenEOF, "", s.row, s.col}
 | |
| 		return s.err
 | |
| 	}
 | |
| 	if s.pos == 0 {
 | |
| 		// Test BOM only once, at the beginning of the file.
 | |
| 		if strings.HasPrefix(s.input, "\uFEFF") {
 | |
| 			return s.emitSimple(TokenBOM, "\uFEFF")
 | |
| 		}
 | |
| 	}
 | |
| 	// There's a lot we can guess based on the first byte so we'll take a
 | |
| 	// shortcut before testing multiple regexps.
 | |
| 	input := s.input[s.pos:]
 | |
| 	switch input[0] {
 | |
| 	case '\t', '\n', '\f', '\r', ' ':
 | |
| 		// Whitespace.
 | |
| 		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
 | |
| 	case '.':
 | |
| 		// Dot is too common to not have a quick check.
 | |
| 		// We'll test if this is a Char; if it is followed by a number it is a
 | |
| 		// dimension/percentage/number, and this will be matched later.
 | |
| 		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
 | |
| 			return s.emitSimple(TokenChar, ".")
 | |
| 		}
 | |
| 	case '#':
 | |
| 		// Another common one: Hash or Char.
 | |
| 		if match := matchers[TokenHash].FindString(input); match != "" {
 | |
| 			return s.emitToken(TokenHash, match)
 | |
| 		}
 | |
| 		return s.emitSimple(TokenChar, "#")
 | |
| 	case '@':
 | |
| 		// Another common one: AtKeyword or Char.
 | |
| 		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
 | |
| 			return s.emitSimple(TokenAtKeyword, match)
 | |
| 		}
 | |
| 		return s.emitSimple(TokenChar, "@")
 | |
| 	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
 | |
| 		// More common chars.
 | |
| 		return s.emitSimple(TokenChar, string(input[0]))
 | |
| 	case '"', '\'':
 | |
| 		// String or error.
 | |
| 		match := matchers[TokenString].FindString(input)
 | |
| 		if match != "" {
 | |
| 			return s.emitToken(TokenString, match)
 | |
| 		}
 | |
| 
 | |
| 		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
 | |
| 		return s.err
 | |
| 	case '/':
 | |
| 		// Comment, error or Char.
 | |
| 		if len(input) > 1 && input[1] == '*' {
 | |
| 			match := matchers[TokenComment].FindString(input)
 | |
| 			if match != "" {
 | |
| 				return s.emitToken(TokenComment, match)
 | |
| 			} else {
 | |
| 				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
 | |
| 				return s.err
 | |
| 			}
 | |
| 		}
 | |
| 		return s.emitSimple(TokenChar, "/")
 | |
| 	case '~':
 | |
| 		// Includes or Char.
 | |
| 		return s.emitPrefixOrChar(TokenIncludes, "~=")
 | |
| 	case '|':
 | |
| 		// DashMatch or Char.
 | |
| 		return s.emitPrefixOrChar(TokenDashMatch, "|=")
 | |
| 	case '^':
 | |
| 		// PrefixMatch or Char.
 | |
| 		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
 | |
| 	case '$':
 | |
| 		// SuffixMatch or Char.
 | |
| 		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
 | |
| 	case '*':
 | |
| 		// SubstringMatch or Char.
 | |
| 		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
 | |
| 	case '<':
 | |
| 		// CDO or Char.
 | |
| 		return s.emitPrefixOrChar(TokenCDO, "<!--")
 | |
| 	}
 | |
| 	// Test all regexps, in order.
 | |
| 	for _, token := range matchOrder {
 | |
| 		if match := matchers[token].FindString(input); match != "" {
 | |
| 			return s.emitToken(token, match)
 | |
| 		}
 | |
| 	}
 | |
| 	// We already handled unclosed quotation marks and comments,
 | |
| 	// so this can only be a Char.
 | |
| 	r, width := utf8.DecodeRuneInString(input)
 | |
| 	token := &Token{TokenChar, string(r), s.row, s.col}
 | |
| 	s.col += width
 | |
| 	s.pos += width
 | |
| 	return token
 | |
| }
 | |
| 
 | |
| // updatePosition updates input coordinates based on the consumed text.
 | |
| func (s *Scanner) updatePosition(text string) {
 | |
| 	width := utf8.RuneCountInString(text)
 | |
| 	lines := strings.Count(text, "\n")
 | |
| 	s.row += lines
 | |
| 	if lines == 0 {
 | |
| 		s.col += width
 | |
| 	} else {
 | |
| 		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
 | |
| 	}
 | |
| 	s.pos += len(text) // while col is a rune index, pos is a byte index
 | |
| }
 | |
| 
 | |
| // emitToken returns a Token for the string v and updates the scanner position.
 | |
| func (s *Scanner) emitToken(t tokenType, v string) *Token {
 | |
| 	token := &Token{t, v, s.row, s.col}
 | |
| 	s.updatePosition(v)
 | |
| 	return token
 | |
| }
 | |
| 
 | |
| // emitSimple returns a Token for the string v and updates the scanner
 | |
| // position in a simplified manner.
 | |
| //
 | |
| // The string is known to have only ASCII characters and to not have a newline.
 | |
| func (s *Scanner) emitSimple(t tokenType, v string) *Token {
 | |
| 	token := &Token{t, v, s.row, s.col}
 | |
| 	s.col += len(v)
 | |
| 	s.pos += len(v)
 | |
| 	return token
 | |
| }
 | |
| 
 | |
| // emitPrefixOrChar returns a Token for type t if the current position
 | |
| // matches the given prefix. Otherwise it returns a Char token using the
 | |
| // first character from the prefix.
 | |
| //
 | |
| // The prefix is known to have only ASCII characters and to not have a newline.
 | |
| func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
 | |
| 	if strings.HasPrefix(s.input[s.pos:], prefix) {
 | |
| 		return s.emitSimple(t, prefix)
 | |
| 	}
 | |
| 	return s.emitSimple(TokenChar, string(prefix[0]))
 | |
| }
 |