// Copyright (c) 2014 Alex Kalyvitis
// Portions Copyright (c) 2011 The Go Authors

package mustache

import (
	"bytes"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// token represents a token or text string returned from the scanner.
type token struct {
	typ  tokenType
	val  string
	line int
	col  int
}

// String satisfies the fmt.Stringer interface, making it easier to print tokens.
func (i token) String() string {
	return fmt.Sprintf("%s:%q", i.typ, i.val)
}
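
// For example, an identifier token holding the value "name" prints as
//
//	t_ident:"name"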

// tokenType identifies the type of lex tokens.
type tokenType int

const (
	tokenError          tokenType = iota // error occurred; value is text of error
	tokenEOF
	tokenIdentifier     // alphanumeric identifier
	tokenLeftDelim      // {{ left action delimiter
	tokenRightDelim     // }} right action delimiter
	tokenText           // plain text
	tokenComment        // {{! this is a comment and is ignored}}
	tokenSectionStart   // {{#foo}} denotes a section start
	tokenSectionInverse // {{^foo}} denotes an inverse section start
	tokenSectionEnd     // {{/foo}} denotes the closing of a section
	tokenRawStart       // { denotes the beginning of an unencoded identifier
	tokenRawEnd         // } denotes the end of an unencoded identifier
	tokenRawAlt         // {{&foo}} is an alternative way to define raw tags
	tokenPartial        // {{>foo}} denotes a partial
	tokenSetDelim       // {{={% %}=}} sets delimiters to {% and %}
	tokenSetLeftDelim   // denotes a custom left delimiter
	tokenSetRightDelim  // denotes a custom right delimiter
)

// Make the types prettyprint.
var tokenName = map[tokenType]string{
	tokenError:          "t_error",
	tokenEOF:            "t_eof",
	tokenIdentifier:     "t_ident",
	tokenLeftDelim:      "t_left_delim",
	tokenRightDelim:     "t_right_delim",
	tokenText:           "t_text",
	tokenComment:        "t_comment",
	tokenSectionStart:   "t_section_start",
	tokenSectionInverse: "t_section_inverse",
	tokenSectionEnd:     "t_section_end",
	tokenRawStart:       "t_raw_start",
	tokenRawEnd:         "t_raw_end",
	tokenRawAlt:         "t_raw_alt",
	tokenPartial:        "t_partial",
	tokenSetDelim:       "t_set_delim",
	tokenSetLeftDelim:   "t_set_left_delim",
	tokenSetRightDelim:  "t_set_right_delim",
}

// String satisfies the fmt.Stringer interface, making it easier to print
// token types.
func (i tokenType) String() string {
	s := tokenName[i]
	if s == "" {
		return fmt.Sprintf("t_unknown_%d", int(i))
	}
	return s
}

const eof = -1

// stateFn represents the state of the scanner as a function that returns the
// next state.
type stateFn func(*lexer) stateFn

// lexer holds the state of the scanner.
type lexer struct {
	name       string     // the name of the input; used only for error reports.
	input      string     // the string being scanned.
	leftDelim  string     // start of action.
	rightDelim string     // end of action.
	state      stateFn    // the next lexing function to enter.
	pos        int        // current position in the input.
	start      int        // start position of this token.
	width      int        // width of last rune read from input.
	tokens     chan token // channel of scanned tokens.
}

// next returns the next rune in the input.
func (l *lexer) next() (r rune) {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

// seek advances the position by n bytes.
func (l *lexer) seek(n int) {
	l.pos += n
}

// peek returns but does not consume the next rune in the input.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune. Can only be called once per call of next.
func (l *lexer) backup() {
	l.pos -= l.width
}

// emit passes a token back to the client.
func (l *lexer) emit(t tokenType) {
	l.tokens <- token{
		t,
		l.input[l.start:l.pos],
		l.lineNum(),
		l.columnNum(),
	}
	l.start = l.pos
}
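
// Illustrative note: emit sends the half-open span input[start:pos) and then
// moves start up to pos. With input "ab{{c}}" and pos resting on the first
// '{', emit(tokenText) produces the token t_text:"ab".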

// ignore skips over the pending input before this point.
func (l *lexer) ignore() {
	l.start = l.pos
}

// lineNum reports which line we're on. Doing it this way
// means we don't have to worry about peek double counting.
func (l *lexer) lineNum() int {
	return 1 + strings.Count(l.input[:l.pos], "\n")
}

// columnNum reports the column we're on within the current line.
func (l *lexer) columnNum() int {
	if lf := strings.LastIndex(l.input[:l.pos], "\n"); lf != -1 {
		return len(l.input[lf+1 : l.pos])
	}
	return len(l.input[:l.pos])
}

// errorf returns an error token and terminates the scan by passing
// back a nil pointer that will be the next state, terminating l.token.
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
	l.tokens <- token{
		tokenError,
		fmt.Sprintf(format, args...),
		l.lineNum(),
		l.columnNum(),
	}
	return nil
}

// token returns the next token from the input.
func (l *lexer) token() token {
	for {
		select {
		case token := <-l.tokens:
			return token
		default:
			l.state = l.state(l)
		}
	}
}
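
// Design note: token drives the state machine synchronously, one state at a
// time, until a token lands in the buffered channel; no goroutine is needed
// and the lexer only works as hard as the parser asks it to. A typical caller
// loop (illustrative sketch) looks like:
//
//	for t := l.token(); t.typ != tokenEOF && t.typ != tokenError; t = l.token() {
//		// handle t
//	}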

// String renders the lexer's current state for debugging.
func (l *lexer) String() string {
	w := bytes.NewBuffer(nil)
	fmt.Fprintf(w, "Template: %q\n", l.input)
	fmt.Fprintf(w, "Index   : %d\n", l.pos)
	if l.pos < len(l.input) {
		fmt.Fprintf(w, "Current : %q\n", l.input[l.pos])
	}
	fmt.Fprintf(w, "Buffer  : %q\n", l.input[l.start:l.pos])
	return w.String()
}

// newLexer creates a new scanner for the input string.
func newLexer(input, left, right string) *lexer {
	l := &lexer{
		input:      input,
		leftDelim:  left,
		rightDelim: right,
		tokens:     make(chan token, 2),
	}
	l.state = stateText // initial state
	return l
}
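
// Illustrative usage, assuming the standard mustache delimiters:
//
//	l := newLexer("Hello, {{name}}!", "{{", "}}")
//	t := l.token() // t_text:"Hello, "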

// state functions.

// stateText scans until an opening action delimiter, "{{".
func stateText(l *lexer) stateFn {
	for {
		// Look ahead for the left delimiter, which switches to lexing an
		// open tag instead of regular text tokens.
		if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
			if l.pos > l.start {
				l.emit(tokenText)
			}
			return stateLeftDelim
		}
		// Produce a token and exit the loop if we have reached the end of file.
		if l.next() == eof {
			break
		}
	}
	// Emit whatever we gathered so far as text.
	if l.pos > l.start {
		l.emit(tokenText)
	}
	// Always end with an EOF token. The parser will keep asking for tokens
	// until a tokenEOF or tokenError token is encountered.
	l.emit(tokenEOF)
	// The text state doesn't have a default next state.
	return nil
}
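
// Illustrative token stream: lexing "a {{b}} c" with the default delimiters
// yields
//
//	t_text:"a " t_left_delim:"{{" t_ident:"b" t_right_delim:"}}" t_text:" c" t_eof:""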

// stateLeftDelim scans the left delimiter, which is known to be present.
func stateLeftDelim(l *lexer) stateFn {
	l.seek(len(l.leftDelim))
	if l.peek() == '=' {
		// When the lexer encounters "{{=" it proceeds to the set delimiter
		// state which alters the left and right delimiters. This operation is
		// hidden from the parser and no tokens are emitted.
		l.next()
		return stateSetDelim
	}
	l.emit(tokenLeftDelim)
	return stateTag
}

// stateRightDelim scans the right delimiter, which is known to be present.
func stateRightDelim(l *lexer) stateFn {
	l.seek(len(l.rightDelim))
	l.emit(tokenRightDelim)
	return stateText
}

// stateTag scans the elements inside action delimiters.
func stateTag(l *lexer) stateFn {
	if strings.HasPrefix(l.input[l.pos:], "}"+l.rightDelim) {
		l.seek(1)
		l.emit(tokenRawEnd)
		return stateRightDelim
	}
	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
		return stateRightDelim
	}
	switch r := l.next(); {
	case r == eof || r == '\n':
		return l.errorf("unclosed action")
	case whitespace(r):
		l.ignore()
	case r == '!':
		l.emit(tokenComment)
		return stateComment
	case r == '#':
		l.emit(tokenSectionStart)
	case r == '^':
		l.emit(tokenSectionInverse)
	case r == '/':
		l.emit(tokenSectionEnd)
	case r == '&':
		l.emit(tokenRawAlt)
	case r == '>':
		l.emit(tokenPartial)
	case r == '{':
		l.emit(tokenRawStart)
	case alphanum(r):
		l.backup()
		return stateIdent
	default:
		return l.errorf("unrecognized character in action: %#U", r)
	}
	return stateTag
}
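
// Illustrative sigil handling: inside "{{#items}}" the '#' is emitted as its
// own token before stateIdent takes over, so the full stream is
//
//	t_left_delim:"{{" t_section_start:"#" t_ident:"items" t_right_delim:"}}"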

// stateIdent scans an alphanumeric or field.
func stateIdent(l *lexer) stateFn {
Loop:
	for {
		switch r := l.next(); {
		case alphanum(r):
			// absorb.
		default:
			l.backup()
			l.emit(tokenIdentifier)
			break Loop
		}
	}
	return stateTag
}

// stateComment scans a comment. The left comment marker is known to be present.
func stateComment(l *lexer) stateFn {
	i := strings.Index(l.input[l.pos:], l.rightDelim)
	if i < 0 {
		return l.errorf("unclosed tag")
	}
	l.seek(i)
	l.emit(tokenText)
	return stateRightDelim
}
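
// Illustrative comment handling: "{{! ignore me }}" lexes to
//
//	t_left_delim:"{{" t_comment:"!" t_text:" ignore me " t_right_delim:"}}"
//
// so the comment body reaches the parser as an ordinary text token.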

// stateSetDelim scans a set delimiter tag and replaces the lexer's left and
// right delimiters with the new values.
func stateSetDelim(l *lexer) stateFn {
	end := "=" + l.rightDelim
	i := strings.Index(l.input[l.pos:], end)
	if i < 0 {
		return l.errorf("unclosed tag")
	}
	// e.g. "{{={% %}=}}" leaves "{% %}" between the markers, which splits
	// into the new left and right delimiters.
	delims := strings.Split(l.input[l.pos:l.pos+i], " ")
	if len(delims) < 2 {
		return l.errorf("set delimiters should be separated by a space")
	}
	delimFn := leftFn
	for _, delim := range delims {
		if delim != "" {
			if delimFn != nil {
				delimFn = delimFn(l, delim)
			}
		}
	}
	l.seek(i + len(end))
	l.ignore()
	l.emit(tokenSetDelim)
	return stateText
}
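
// Illustrative delimiter change: after lexing "{{=<% %>=}}" the lexer emits
// t_set_delim:"" and from then on "<%" and "%>" open and close tags, so
// "<%name%>" lexes the same way "{{name}}" did before the switch.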

// delimFn is a self-referential function type which helps set the left and
// right delimiters in the right order.
type delimFn func(l *lexer, s string) delimFn

// leftFn sets the left delimiter to s and returns rightFn.
func leftFn(l *lexer, s string) delimFn {
	l.leftDelim = s
	return rightFn
}

// rightFn sets the right delimiter to s and ends the chain.
func rightFn(l *lexer, s string) delimFn {
	l.rightDelim = s
	return nil
}

// whitespace reports whether r is a space character.
func whitespace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}

// alphanum reports whether r is a letter, digit, underscore, or dot. The dot
// is allowed so that dotted names such as "a.b.c" lex as one identifier.
func alphanum(r rune) bool {
	return r == '_' || r == '.' || unicode.IsLetter(r) || unicode.IsDigit(r)
}
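
// lexExample is an illustrative sketch, not part of the original API: it runs
// the lexer over a small template and prints every token until the stream
// ends with tokenEOF or tokenError.
func lexExample() {
	l := newLexer("Hello, {{name}}!", "{{", "}}")
	for {
		t := l.token()
		fmt.Println(t) // e.g. t_text:"Hello, ", t_left_delim:"{{", ...
		if t.typ == tokenEOF || t.typ == tokenError {
			break
		}
	}
}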