From ca7f4c9acb9ce7a9f37678ac6e08dfd6564d762c Mon Sep 17 00:00:00 2001 From: Gregory Eremin Date: Mon, 16 Feb 2015 18:18:14 +0700 Subject: [PATCH] Simplifying lexer --- lex.go | 8 +- lexer/lexer.go | 248 ++++++++++++++++++++++++++++++++++++------------- lexer/state.go | 128 ------------------------- test.json | 5 +- 4 files changed, 193 insertions(+), 196 deletions(-) delete mode 100644 lexer/state.go diff --git a/lex.go b/lex.go index fbc0a66..4c971d3 100644 --- a/lex.go +++ b/lex.go @@ -12,12 +12,12 @@ func main() { f, _ := os.Open("test.json") b, _ := ioutil.ReadAll(f) - lex := lexer.New("foo", string(b)) + lex := lexer.New(string(b)) go lex.Run() for { - i := lex.NextItem() - fmt.Println(i) - if i.String() == "EOF" { + if item, ok := lex.NextItem(); ok { + fmt.Println(item) + } else { break } } diff --git a/lexer/lexer.go b/lexer/lexer.go index edc6f68..a15096e 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -9,54 +9,48 @@ import ( type ( // lexer holds the state of the scanner. Lexer struct { - input string // the string being scanned - state stateFn // the next lexing function to enter - lineNum int // Line number - pos int // current position in the input - start int // start position of this item - width int // width of last rune read from input - lastPos int // position of most recent item returned by nextItem - items chan Item // channel of scanned items - parenDepth int // nesting depth of ( ) exprs + input string // the string being scanned + state stateFn // the next lexing function to enter + lineNum int // Line number + pos int // current position in the input + start int // start position of this item + width int // width of last rune read from input + items chan Item // channel of scanned items } + // Item represents a token or text string returned from the scanner. + Item struct { + Token Token // The type of this item. + Pos int // The starting position, in bytes, of this item in the input string. + Val string // The value of this item. + } + + // Token identifies the type of lex items. + Token int + // stateFn represents the state of the scanner as a function that returns the next state. stateFn func(*Lexer) stateFn - - // item represents a token or text string returned from the scanner. - Item struct { - typ itemType // The type of this item. - pos int // The starting position, in bytes, of this item in the input string. - val string // The value of this item. - } - - // itemType identifies the type of lex items. - itemType int ) const ( // Special - itemError itemType = iota // error occurred; value is text of error - itemEOF + Error Token = iota // error occurred; value is text of error + EOF // Symbols - itemBraceOpen // { - itemBraceClose // } - itemBracketOpen // [ - itemBracketClose // [ - itemQuote // " - itemColon // : - itemComma // , + BraceOpen // { + BraceClose // } + BracketOpen // [ + BracketClose // [ + Quote // " + Colon // : + Comma // , // Types - itemNull // null - itemBool // true, false - itemNumber // 0, 2.5 - itemString // "foo" -) - -const ( - EOF = -1 + Null // null + Bool // true, false + Number // 0, 2.5 + String // "foo" ) // lex creates a new scanner for the input string. @@ -75,10 +69,9 @@ func (l *Lexer) Run() { } } -func (l *Lexer) NextItem() Item { - item := <-l.items - l.lastPos = item.pos - return item +func (l *Lexer) NextItem() (item Item, ok bool) { + item, ok = <-l.items + return } // @@ -86,33 +79,33 @@ func (l *Lexer) NextItem() Item { // func (i Item) String() string { - switch i.typ { - case itemEOF: + switch i.Token { + case EOF: return "EOF" - case itemError: - return "Error: " + i.val - case itemBraceOpen: + case Error: + return "Error: " + i.Val + case BraceOpen: return "{" - case itemBraceClose: + case BraceClose: return "}" - case itemBracketOpen: + case BracketOpen: return "[" - case itemBracketClose: + case BracketClose: return "]" - case itemQuote: + case Quote: return "\"" - case itemColon: + case Colon: return ":" - case itemComma: + case Comma: return "," - case itemNull: + case Null: return "NULL" - case itemBool: - return "Bool: " + i.val - case itemNumber: - return "Number: " + i.val - case itemString: - return "String: " + i.val + case Bool: + return "Bool: " + i.Val + case Number: + return "Number: " + i.Val + case String: + return "String: " + i.Val default: panic("Unreachable") } @@ -122,7 +115,7 @@ func (i Item) String() string { func (l *Lexer) next() rune { if int(l.pos) >= len(l.input) { l.width = 0 - return EOF + return 0 } r, w := utf8.DecodeRuneInString(l.input[l.pos:]) l.width = w @@ -143,9 +136,12 @@ func (l *Lexer) backup() { } // emit passes an item back to the client. -func (l *Lexer) emit(t itemType) { +func (l *Lexer) emit(t Token) { l.items <- Item{t, l.start, l.input[l.start:l.pos]} l.start = l.pos + if t == EOF { + close(l.items) + } } // ignore skips over the pending input before this point. @@ -161,9 +157,135 @@ func (l *Lexer) acceptString(s string) (ok bool) { return false } -// errorf returns an error token and terminates the scan by passing -// back a nil pointer that will be the next state, terminating l.nextItem. func (l *Lexer) errorf(format string, args ...interface{}) stateFn { - l.items <- Item{itemError, l.start, fmt.Sprintf(format, args...)} + l.items <- Item{Error, l.start, fmt.Sprintf(format, args...)} + return nil // Stop lexing +} + +// +// States +// + +func lexInitial(l *Lexer) stateFn { +loop: + for { + switch r := l.next(); r { + case ' ', '\t': + return lexSpace(l) + case '\n': + l.lineNum++ + case 'n': + l.backup() + return lexNull(l) + case 't', 'f': + l.backup() + return lexBool(l) + case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0': + l.backup() + return lexNumber(l) + case '"': + return lexString(l) + case '[': + l.emit(BracketOpen) + case ']': + l.emit(BracketClose) + case '{': + l.emit(BraceOpen) + case '}': + l.emit(BraceClose) + case ':': + l.emit(Colon) + case ',': + l.emit(Comma) + case 0: + break loop + default: + panic("Unexpected symbol: " + string(r)) + } + } + + // Correctly reached EOF. + l.emit(EOF) + return nil } + +// Skip all spaces +// One space has already been seen +func lexSpace(l *Lexer) stateFn { + for { + if r := l.peek(); r == ' ' || r == '\t' { + l.next() + } else { + break + } + } + l.ignore() + + return lexInitial +} + +func lexNull(l *Lexer) stateFn { + if l.acceptString("null") { + l.emit(Null) + } else { + return l.errorf("Unexpected token") + } + return lexInitial +} + +func lexBool(l *Lexer) stateFn { + if l.acceptString("true") || l.acceptString("false") { + l.emit(Bool) + } + return lexInitial +} + +func lexNumber(l *Lexer) stateFn { + hasDot := false +loop: + for { + switch r := l.peek(); r { + case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0': + l.next() + case '.': + if hasDot { + return l.errorf("Invalid number") + } else { + hasDot = true + l.next() + } + default: + break loop + } + } + + l.emit(Number) + return lexInitial +} + +func lexString(l *Lexer) stateFn { + escaped := false +loop: + for { + switch r := l.next(); r { + case '\\': + escaped = true + case '"': + if escaped { + escaped = false + } else { + l.emit(String) + break loop + } + case '\n': + l.lineNum++ + case 0: + return l.errorf("Unterminated string") + default: + escaped = false + } + } + + return lexInitial +} diff --git a/lexer/state.go b/lexer/state.go deleted file mode 100644 index 5235a69..0000000 --- a/lexer/state.go +++ /dev/null @@ -1,128 +0,0 @@ -package lexer - -import "strings" - -func lexInitial(l *Lexer) stateFn { -loop: - for { - switch r := l.next(); r { - case ' ', '\t': - return lexSpace(l) - case '\n': - l.lineNum++ - case 'n': - l.backup() - return lexNull(l) - case 't', 'f': - l.backup() - return lexBool(l) - case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0': - l.backup() - return lexNumber(l) - case '"': - return lexString(l) - case '[': - l.emit(itemBracketOpen) - case ']': - l.emit(itemBracketClose) - case '{': - l.emit(itemBraceOpen) - case '}': - l.emit(itemBraceClose) - case ':': - l.emit(itemColon) - case ',': - l.emit(itemComma) - case EOF: - break loop - default: - panic("Unexpected symbol: " + string(r)) - } - } - - // Correctly reached EOF. - l.emit(itemEOF) - - return nil -} - -// Skip all spaces -// One space has already been seen -func lexSpace(l *Lexer) stateFn { - for isSpace(l.peek()) { - l.next() - } - l.ignore() - return lexInitial -} - -func lexNull(l *Lexer) stateFn { - if l.acceptString("null") { - l.emit(itemNull) - } else { - return l.errorf("Unexpected token") - } - return lexInitial -} - -func lexBool(l *Lexer) stateFn { - if l.acceptString("true") || l.acceptString("false") { - l.emit(itemBool) - } - return lexInitial -} - -func lexNumber(l *Lexer) stateFn { - hasDot := false - for { - if r := l.peek(); isDigit(r) { - l.next() - } else if r == '.' { - if hasDot { - return l.errorf("Invalid number") - } else { - hasDot = true - l.next() - } - } else { - break - } - } - - l.emit(itemNumber) - return lexInitial -} - -func lexString(l *Lexer) stateFn { - escaped := false -loop: - for { - switch r := l.next(); r { - case '\\': - escaped = true - case '"': - if escaped { - escaped = false - } else { - l.emit(itemString) - break loop - } - case '\n': - l.lineNum++ - case EOF: - return l.errorf("String hits EOF") - default: - escaped = false - } - } - - return lexInitial -} - -func isSpace(r rune) bool { - return r == ' ' || r == '\t' -} - -func isDigit(r rune) bool { - return strings.IndexRune("1234567890", r) > -1 -} diff --git a/test.json b/test.json index 6200fd0..5d11f4c 100644 --- a/test.json +++ b/test.json @@ -9,5 +9,8 @@ {"length": 13, "weight": 5}, {"length": 18, "weight": 8}, {"length": 13, "weight": 4} - ] + ], + "misc": { + "string with quotes": "beware of \"quoted\" text" + } }