kifflom/lexer/lexer.go

// This lexer is based on ideas and code presented in the talk by Rob Pike
// called "Lexical Scanning in Go". More information can be found on the Go blog:
// http://blog.golang.org/two-go-talks-lexical-scanning-in-go-and
package lexer

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// Lexer holds the state of the scanner.
type Lexer struct {
	input   string    // the string being scanned
	lineNum int       // number of newlines consumed so far
	pos     int       // current byte offset into input
	start   int       // byte offset at which the pending item begins
	width   int       // byte width of the last rune read (0 at the end of input)
	items   chan Item // channel on which scanned items are delivered
}

// Item is a single token delivered by the scanner.
type Item struct {
	Token Token  // the type of this item
	Val   string // the text of this item (the message, for Error items)
	Pos   int    // the starting position, in bytes, of this item in the input string
}

// Token identifies the type of an Item.
type Token int

// stateFn represents the state of the scanner as a function that returns the
// next state; a nil stateFn stops the state machine.
type stateFn func(*Lexer) stateFn

// Token values that classify the items produced by the lexer.
const (
	// Special: an Error item carries its message in Val, EOF marks the end
	// of the input
	Error Token = iota
	EOF

	// Symbols
	BraceOpen    // {
	BraceClose   // }
	BracketOpen  // [
	BracketClose // ]
	Quote        // " (declared, but none of the states visible in this file emit it)
	Colon        // :
	Comma        // ,

	// Types
	Null
	Bool
	Number
	String
)

// New returns a Lexer that is ready to scan input. Items are delivered over
// an unbuffered channel, so every send made while lexing blocks until the
// item is received through NextItem.
func New(input string) *Lexer {
	lx := new(Lexer)
	lx.input = input
	lx.items = make(chan Item)
	return lx
}

// Run drives the state machine: starting from lexInitial, every state is
// invoked and yields its successor until one of them returns nil.
func (l *Lexer) Run() {
	var state stateFn = lexInitial
	for state != nil {
		state = state(l)
	}
}

// NextItem receives the next scanned item. ok is true for every delivered
// item, including the final EOF or Error item, and becomes false once the
// items channel has been closed and drained.
func (l *Lexer) NextItem() (item Item, ok bool) {
	received, open := <-l.items
	return received, open
}

// next consumes and returns the next rune of the input and remembers its byte
// width so that backup can undo the read. At the end of the input it returns
// 0 and records a width of 0.
func (l *Lexer) next() rune {
	if remaining := len(l.input) - l.pos; remaining <= 0 {
		l.width = 0
		return 0
	}
	r, size := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = size
	l.pos += size
	return r
}

// peek returns the next rune of the input without consuming it: the rune is
// read with next and the position is restored right afterwards.
func (l *Lexer) peek() rune {
	defer l.backup()
	return l.next()
}

// acceptString consumes s when the input at the current position starts with
// it and reports whether it did so. On a mismatch the position is left
// untouched. The width remembered for backup is not updated either way.
func (l *Lexer) acceptString(s string) (ok bool) {
	if !strings.HasPrefix(l.input[l.pos:], s) {
		return false
	}
	l.pos += len(s)
	return true
}

// backup un-reads the rune most recently returned by next. Only one width is
// remembered, so it undoes a single call to next; after a read at the end of
// the input (width 0) it does nothing.
func (l *Lexer) backup() {
	l.pos = l.pos - l.width
}

// ignore discards the input scanned since the last item by moving the start
// of the pending item up to the current position
func (l *Lexer) ignore() {
	l.start = l.pos
}

// emit sends the pending text input[start:pos] to the client as an item of
// type t and then moves start up to pos. Emitting EOF also closes the items
// channel, since nothing can follow it.
func (l *Lexer) emit(t Token) {
	from, to := l.start, l.pos
	l.items <- Item{Token: t, Val: l.input[from:to], Pos: from}
	l.start = to
	if t == EOF {
		close(l.items)
	}
}

// errorf sends an Error item whose value is the formatted message and whose
// position is the start of the pending item, closes the items channel and
// returns nil so that the state machine stops.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	msg := fmt.Sprintf(format, args...)
	l.items <- Item{Token: Error, Val: msg, Pos: l.start}
	close(l.items)
	return nil
}

//
// States
//

// lexInitial is the dispatching state. It skips whitespace, emits the
// punctuation tokens itself and hands every literal over to its dedicated
// state. It returns nil after emitting EOF or an error.
func lexInitial(l *Lexer) stateFn {
	for {
		switch r := l.next(); r {
		case ' ', '\t':
			return lexSpace(l)
		case '\r':
			// Carriage return is insignificant whitespace in JSON, so inputs
			// with CRLF line endings must not be rejected
			l.ignore()
		case '\n':
			l.lineNum++
			l.ignore()
		case 'n':
			l.backup()
			return lexNull(l)
		case 't', 'f':
			l.backup()
			return lexBool(l)
		case '-':
			// The sign is left in the pending text so that it becomes part of
			// the number's value; it has to be followed by a digit
			if d := l.peek(); d < '0' || d > '9' {
				return l.errorf("Invalid number")
			}
			return lexNumber(l)
		case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0':
			l.backup()
			return lexNumber(l)
		case '"':
			return lexString(l)
		case '[':
			l.emit(BracketOpen)
		case ']':
			l.emit(BracketClose)
		case '{':
			l.emit(BraceOpen)
		case '}':
			l.emit(BraceClose)
		case ':':
			l.emit(Colon)
		case ',':
			l.emit(Comma)
		case 0:
			// next returns 0 both at the end of the input (width 0) and for a
			// literal NUL byte (width 1); only the former is a real EOF
			if l.width > 0 {
				return l.errorf("Unexpected symbol: %q", r)
			}
			l.emit(EOF)
			return nil
		default:
			return l.errorf("Unexpected symbol: %c", r)
		}
	}
}

// lexSpace consumes a run of spaces and tabs, drops it from the pending
// input and hands control back to lexInitial at the first other rune.
func lexSpace(l *Lexer) stateFn {
	r := l.next()
	for r == ' ' || r == '\t' {
		r = l.next()
	}
	// The rune that ended the run belongs to whatever comes next
	l.backup()
	l.ignore()
	return lexInitial
}

// lexNull scans the literal "null" and emits it as a Null item. Any other
// text at this position stops lexing with an error.
func lexNull(l *Lexer) stateFn {
	if !l.acceptString("null") {
		return l.errorf("Unexpected token")
	}
	l.emit(Null)
	return lexInitial
}

// lexBool scans the literal "true" or "false" and emits it as a Bool item.
// Any other text at this position stops lexing with an error, in the same
// way lexNull does.
func lexBool(l *Lexer) stateFn {
	if !l.acceptString("true") && !l.acceptString("false") {
		// Nothing was consumed: going back to lexInitial here would make it
		// see the same 't' or 'f' and dispatch to this state again, forever
		return l.errorf("Unexpected token")
	}
	l.emit(Bool)
	return lexInitial
}

// lexNumber scans a number: a run of digits with at most one dot that is not
// the last rune, optionally followed by an exponent. A malformed number stops
// lexing with an error; otherwise a Number item is emitted and control goes
// back to lexInitial.
func lexNumber(l *Lexer) stateFn {
	var (
		last    rune // Last rune accepted into the mantissa
		numDots = 0  // Dots seen in the mantissa
	)
	for {
		switch r := l.next(); r {
		case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0':
			last = r
		case '.':
			numDots++
			last = r
		case 'e', 'E':
			// An exponent may only follow a well-formed mantissa that ends
			// with a digit
			if numDots > 1 || last < '0' || last > '9' {
				return l.errorf("Invalid number")
			}
			return lexExponent(l)
		default:
			l.backup()
			if numDots > 1 || last == '.' {
				return l.errorf("Invalid number")
			}
			l.emit(Number)
			return lexInitial
		}
	}
}

// lexExponent scans what follows the 'e' or 'E' of a number: an optional sign
// and at least one digit. It then emits the whole number as a Number item.
func lexExponent(l *Lexer) stateFn {
	if sign := l.next(); sign != '+' && sign != '-' {
		l.backup()
	}
	digits := 0
	for r := l.next(); '0' <= r && r <= '9'; r = l.next() {
		digits++
	}
	// The rune that ended the digits belongs to whatever comes next
	l.backup()
	if digits == 0 {
		return l.errorf("Invalid number")
	}
	l.emit(Number)
	return lexInitial
}

// lexString scans a string whose opening quote has already been consumed. The
// text between the quotes is emitted as a String item exactly as written
// (escape sequences are not decoded) and the closing quote is skipped. If the
// input ends before an unescaped closing quote, lexing stops with an error.
func lexString(l *Lexer) stateFn {
	// Skipping opening quote
	l.ignore()
	// True when the previous rune was a backslash that starts an escape
	escaped := false
	for {
		r := l.next()
		switch {
		case r == 0:
			return l.errorf("Unterminated string")
		case escaped:
			// An escape covers exactly one rune, whatever it is. A newline
			// must end it too, or the next quote would count as escaped
			escaped = false
			if r == '\n' {
				l.lineNum++
			}
		case r == '\\':
			escaped = true
		case r == '\n':
			l.lineNum++
		case r == '"':
			// Going before closing quote and emitting
			l.backup()
			l.emit(String)
			// Skipping closing quote
			l.next()
			l.ignore()
			return lexInitial
		}
	}
}

// String renders the item for display. Error, Bool, Number and String items
// show their value after a label, every other known token has a fixed text.
// It panics for a Token that is not one of the declared constants.
func (i Item) String() string {
	tok, val := i.Token, i.Val
	switch tok {
	// Items whose value is worth showing
	case Error:
		return "Error: " + val
	case Bool:
		return "Bool: " + val
	case Number:
		return "Number: " + val
	case String:
		return "String: " + val

	// Items with a fixed rendering
	case Null:
		return "NULL"
	case EOF:
		return "EOF"
	case BraceOpen:
		return "{"
	case BraceClose:
		return "}"
	case BracketOpen:
		return "["
	case BracketClose:
		return "]"
	case Quote:
		return "\""
	case Colon:
		return ":"
	case Comma:
		return ","
	}
	panic("Unreachable")
}
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// This lexer is based on ideas and code presented in the talk by Rob Pike`
			`// called "Lexical Scanning in Go". More info could be found in Golang's Blog:`
			`// http://blog.golang.org/two-go-talks-lexical-scanning-in-go-and`
Initial commit 2015-02-12 11:12:19 +00:00			`package lexer`

			`import (`
			`"fmt"`
			`"strings"`
			`"unicode/utf8"`
			`)`

			`type (`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Holds the state of the scanner`
Initial commit 2015-02-12 11:12:19 +00:00			`Lexer struct {`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`input string // The string being scanned`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`lineNum int // Line number`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`pos int // Current position in the input`
			`start int // Start position of this item`
			`width int // Width of last rune read from input`
			`items chan Item // Channel of scanned items`
Initial commit 2015-02-12 11:12:19 +00:00			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Represents a token returned from the scanner`
JSON lexer 2015-02-16 08:54:53 +00:00			`Item struct {`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`Token Token // The type of this item`
			`Val string // The value of this item`
Lexer tests 2015-02-16 19:04:17 +00:00			`Pos int // The starting position, in bytes, of this item in the input string`
Initial commit 2015-02-12 11:12:19 +00:00			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Identifies the type of the item`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`Token int`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Represents the state of the scanner as a function that returns the next state`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`stateFn func(*Lexer) stateFn`
Initial commit 2015-02-12 11:12:19 +00:00			`)`

			`const (`
			`// Special`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`Error Token = iota`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`EOF`
Initial commit 2015-02-12 11:12:19 +00:00
			`// Symbols`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`BraceOpen // {`
			`BraceClose // }`
			`BracketOpen // [`
			`BracketClose // [`
			`Quote // "`
			`Colon // :`
			`Comma // ,`
Initial commit 2015-02-12 11:12:19 +00:00
			`// Types`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`Null`
			`Bool`
			`Number`
			`String`
Initial commit 2015-02-12 11:12:19 +00:00			`)`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Creates a new scanner for the input string`
Lexer doesn't need a name 2015-02-16 09:02:31 +00:00			`func New(input string) *Lexer {`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`return &Lexer{`
Initial commit 2015-02-12 11:12:19 +00:00			`input: input,`
JSON lexer 2015-02-16 08:54:53 +00:00			`items: make(chan Item),`
Initial commit 2015-02-12 11:12:19 +00:00			`}`
			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Starts the state machine for the lexer`
Initial commit 2015-02-12 11:12:19 +00:00			`func (l *Lexer) Run() {`
Lexer doen't need to store state 2015-02-16 17:23:27 +00:00			`for state := lexInitial; state != nil; {`
			`state = state(l)`
Initial commit 2015-02-12 11:12:19 +00:00			`}`
			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Returns the next scanned item and a boolean, which is false on EOF`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`func (l *Lexer) NextItem() (item Item, ok bool) {`
			`item, ok = <-l.items`
			`return`
JSON lexer 2015-02-16 08:54:53 +00:00			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Returns the next rune in the input`
Initial commit 2015-02-12 11:12:19 +00:00			`func (l *Lexer) next() rune {`
			`if int(l.pos) >= len(l.input) {`
			`l.width = 0`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`return 0`
Initial commit 2015-02-12 11:12:19 +00:00			`}`
			`r, w := utf8.DecodeRuneInString(l.input[l.pos:])`
JSON lexer 2015-02-16 08:54:53 +00:00			`l.width = w`
Initial commit 2015-02-12 11:12:19 +00:00			`l.pos += l.width`
			`return r`
			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Returns but does not consume the next rune in the input`
Initial commit 2015-02-12 11:12:19 +00:00			`func (l *Lexer) peek() rune {`
			`r := l.next()`
			`l.backup()`
			`return r`
			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Tells if the following input matches the given string`
			`func (l *Lexer) acceptString(s string) (ok bool) {`
			`if strings.HasPrefix(l.input[l.pos:], s) {`
			`l.pos += len(s)`
			`return true`
			`}`
			`return false`
			`}`

			`// Steps back one rune`
Initial commit 2015-02-12 11:12:19 +00:00			`func (l *Lexer) backup() {`
			`l.pos -= l.width`
			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Skips over the pending input before this point`
			`func (l *Lexer) ignore() {`
			`l.start = l.pos`
			`}`

			`// Passes an item back to the client`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`func (l *Lexer) emit(t Token) {`
Lexer tests 2015-02-16 19:04:17 +00:00			`l.items <- Item{`
			`Token: t,`
			`Val: l.input[l.start:l.pos],`
			`Pos: l.start,`
			`}`
Initial commit 2015-02-12 11:12:19 +00:00			`l.start = l.pos`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`if t == EOF {`
			`close(l.items)`
			`}`
Initial commit 2015-02-12 11:12:19 +00:00			`}`

Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Emits an error token with given string as a value and stops lexing`
Initial commit 2015-02-12 11:12:19 +00:00			`func (l *Lexer) errorf(format string, args ...interface{}) stateFn {`
Lexer tests 2015-02-16 19:04:17 +00:00			`l.items <- Item{`
			`Token: Error,`
			`Val: fmt.Sprintf(format, args...),`
			`Pos: l.start,`
			`}`
			`close(l.items)`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`return nil`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`}`

			`//`
			`// States`
			`//`

			`func lexInitial(l *Lexer) stateFn {`
			`for {`
			`switch r := l.next(); r {`
			`case ' ', '\t':`
			`return lexSpace(l)`
			`case '\n':`
			`l.lineNum++`
Get rid of loop labels 2015-02-16 11:25:33 +00:00			`l.ignore()`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`case 'n':`
			`l.backup()`
			`return lexNull(l)`
			`case 't', 'f':`
			`l.backup()`
			`return lexBool(l)`
			`case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0':`
			`l.backup()`
			`return lexNumber(l)`
			`case '"':`
			`return lexString(l)`
			`case '[':`
			`l.emit(BracketOpen)`
			`case ']':`
			`l.emit(BracketClose)`
			`case '{':`
			`l.emit(BraceOpen)`
			`case '}':`
			`l.emit(BraceClose)`
			`case ':':`
			`l.emit(Colon)`
			`case ',':`
			`l.emit(Comma)`
			`case 0:`
Get rid of loop labels 2015-02-16 11:25:33 +00:00			`l.emit(EOF)`
			`return nil`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`default:`
Lexer tests 2015-02-16 19:04:17 +00:00			`return l.errorf("Unexpected symbol: %c", r)`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`}`
			`}`
Initial commit 2015-02-12 11:12:19 +00:00			`}`
Simplifying lexer 2015-02-16 11:18:14 +00:00
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`// Skips all spaces in the input until a visible character is found`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`func lexSpace(l *Lexer) stateFn {`
			`for {`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`if r := l.next(); r != ' ' && r != '\t' {`
			`l.backup()`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`break`
			`}`
			`}`
			`l.ignore()`
			`return lexInitial`
			`}`

			`func lexNull(l *Lexer) stateFn {`
			`if l.acceptString("null") {`
			`l.emit(Null)`
			`} else {`
			`return l.errorf("Unexpected token")`
			`}`
			`return lexInitial`
			`}`

			`func lexBool(l *Lexer) stateFn {`
			`if l.acceptString("true") \|\| l.acceptString("false") {`
			`l.emit(Bool)`
			`}`
			`return lexInitial`
			`}`

			`func lexNumber(l *Lexer) stateFn {`
Lexer tests 2015-02-16 19:04:17 +00:00			`var (`
			`last rune`
			`numDots = 0`
			`)`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`for {`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`switch r := l.next(); r {`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0':`
Lexer tests 2015-02-16 19:04:17 +00:00			`last = r`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`case '.':`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`numDots++`
Lexer tests 2015-02-16 19:04:17 +00:00			`last = r`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00			`default:`
			`l.backup()`
Lexer tests 2015-02-16 19:04:17 +00:00			`if numDots > 1 \|\| last == '.' {`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`return l.errorf("Invalid number")`
			`}`
Get rid of loop labels 2015-02-16 11:25:33 +00:00			`l.emit(Number)`
			`return lexInitial`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`}`
			`}`
			`}`

			`func lexString(l *Lexer) stateFn {`
Lexer tests 2015-02-16 19:04:17 +00:00			`// Skipping opening quote`
Don't add wrapping quotes to string value 2015-02-16 17:05:49 +00:00			`l.ignore()`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`escaped := false`
			`for {`
			`switch r := l.next(); r {`
			`case '\\':`
Get rid of loop labels 2015-02-16 11:25:33 +00:00			`escaped = !escaped`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`case '"':`
			`if escaped {`
			`escaped = false`
			`} else {`
Lexer tests 2015-02-16 19:04:17 +00:00			`// Going before closing quote and emitting`
			`l.backup()`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`l.emit(String)`
Lexer tests 2015-02-16 19:04:17 +00:00			`// Skipping closing quote`
			`l.next()`
			`l.ignore()`
Get rid of loop labels 2015-02-16 11:25:33 +00:00			`return lexInitial`
Simplifying lexer 2015-02-16 11:18:14 +00:00			`}`
			`case '\n':`
			`l.lineNum++`
			`case 0:`
			`return l.errorf("Unterminated string")`
			`default:`
			`escaped = false`
			`}`
			`}`
			`}`
Lexer's cosmetic refactor 2015-02-16 17:50:16 +00:00
			`func (i Item) String() string {`
			`switch i.Token {`
			`case EOF:`
			`return "EOF"`
			`case Error:`
			`return "Error: " + i.Val`
			`case BraceOpen:`
			`return "{"`
			`case BraceClose:`
			`return "}"`
			`case BracketOpen:`
			`return "["`
			`case BracketClose:`
			`return "]"`
			`case Quote:`
			`return "\""`
			`case Colon:`
			`return ":"`
			`case Comma:`
			`return ","`
			`case Null:`
			`return "NULL"`
			`case Bool:`
			`return "Bool: " + i.Val`
			`case Number:`
			`return "Number: " + i.Val`
			`case String:`
			`return "String: " + i.Val`
			`default:`
			`panic("Unreachable")`
			`}`
			`}`