From 0f04fa41e8d5932e16792aa5e52976ff96beac7d Mon Sep 17 00:00:00 2001 From: Gregory Eremin Date: Mon, 16 Feb 2015 15:54:53 +0700 Subject: [PATCH] JSON lexer --- lex.go | 10 +++- lexer/lexer.go | 115 +++++++++++++++++++------------------------- lexer/state.go | 127 ++++++++++++++++++++++++++++++++++++++++--------- test.json | 3 +- 4 files changed, 166 insertions(+), 89 deletions(-) diff --git a/lex.go b/lex.go index 2598cc4..fbc0a66 100644 --- a/lex.go +++ b/lex.go @@ -1,6 +1,7 @@ package main import ( + "fmt" "io/ioutil" "os" @@ -12,5 +13,12 @@ func main() { b, _ := ioutil.ReadAll(f) lex := lexer.New("foo", string(b)) - lex.Run() + go lex.Run() + for { + i := lex.NextItem() + fmt.Println(i) + if i.String() == "EOF" { + break + } + } } diff --git a/lexer/lexer.go b/lexer/lexer.go index 79e3b6f..b842d21 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -11,22 +11,22 @@ type ( Lexer struct { input string // the string being scanned state stateFn // the next lexing function to enter - pos Pos // current position in the input - start Pos // start position of this item - width Pos // width of last rune read from input - lastPos Pos // position of most recent item returned by nextItem - items chan item // channel of scanned items + lineNum int // Line number + pos int // current position in the input + start int // start position of this item + width int // width of last rune read from input + lastPos int // position of most recent item returned by nextItem + items chan Item // channel of scanned items parenDepth int // nesting depth of ( ) exprs } - Pos int // stateFn represents the state of the scanner as a function that returns the next state. stateFn func(*Lexer) stateFn // item represents a token or text string returned from the scanner. - item struct { + Item struct { typ itemType // The type of this item. - pos Pos // The starting position, in bytes, of this item in the input string. + pos int // The starting position, in bytes, of this item in the input string. val string // The value of this item. } @@ -38,7 +38,6 @@ const ( // Special itemError itemType = iota // error occurred; value is text of error itemEOF - itemSpace // Symbols itemBraceOpen // { @@ -54,27 +53,17 @@ const ( itemBool // true, false itemNumber // 0, 2.5 itemString // "foo" - itemArray // [1, 2, 3] - itemObject // {"a": 1, "b": 2} ) const ( EOF = -1 ) -var ( - itemMap = map[string]itemType{ - "null": itemNull, - "true": itemBool, - "false": itemBool, - } -) - // lex creates a new scanner for the input string. func New(name, input string) *Lexer { l := &Lexer{ input: input, - items: make(chan item), + items: make(chan Item), } return l } @@ -86,20 +75,47 @@ func (l *Lexer) Run() { } } +func (l *Lexer) NextItem() Item { + item := <-l.items + l.lastPos = item.pos + return item +} + // // Lexer stuff // -func (i item) String() string { - switch { - case i.typ == itemEOF: +func (i Item) String() string { + switch i.typ { + case itemEOF: return "EOF" - case i.typ == itemError: - return i.val - case len(i.val) > 10: - return fmt.Sprintf("%.10q...", i.val) + case itemError: + return "Error: " + i.val + case itemBraceOpen: + return "{" + case itemBraceClose: + return "}" + case itemBracketOpen: + return "[" + case itemBracketClose: + return "]" + case itemQuote: + return "\"" + case itemColon: + return ":" + case itemComma: + return "," + case itemNull: + return "NULL" + case itemBool: + return "Bool: " + i.val + case itemNumber: + return "Number: " + i.val + case itemString: + return "String: " + i.val + default: + panic("Unreachable") } - return fmt.Sprintf("%q", i.val) } // next returns the next rune in the input. @@ -109,7 +125,7 @@ func (l *Lexer) next() rune { return EOF } r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - l.width = Pos(w) + l.width = w l.pos += l.width return r } @@ -128,7 +144,7 @@ func (l *Lexer) backup() { // emit passes an item back to the client. func (l *Lexer) emit(t itemType) { - l.items <- item{t, l.start, l.input[l.start:l.pos]} + l.items <- Item{t, l.start, l.input[l.start:l.pos]} l.start = l.pos } @@ -137,48 +153,17 @@ func (l *Lexer) ignore() { l.start = l.pos } -// accept consumes the next rune if it's from the valid set. -func (l *Lexer) accept(valid string) bool { - if strings.IndexRune(valid, l.next()) >= 0 { +func (l *Lexer) acceptString(s string) (ok bool) { + if strings.HasPrefix(l.input[l.pos:], s) { + l.pos += len(s) return true } - l.backup() return false } -// acceptRun consumes a run of runes from the valid set. -func (l *Lexer) acceptRun(valid string) { - for strings.IndexRune(valid, l.next()) >= 0 { - } - l.backup() -} - -// lineNumber reports which line we're on, based on the position of -// the previous item returned by nextItem. Doing it this way -// means we don't have to worry about peek double counting. -func (l *Lexer) lineNumber() int { - return 1 + strings.Count(l.input[:l.lastPos], "\n") -} - // errorf returns an error token and terminates the scan by passing // back a nil pointer that will be the next state, terminating l.nextItem. func (l *Lexer) errorf(format string, args ...interface{}) stateFn { - l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} + l.items <- Item{itemError, l.start, fmt.Sprintf(format, args...)} return nil } - -// nextItem returns the next item from the input. -func (l *Lexer) nextItem() item { - item := <-l.items - l.lastPos = item.pos - return item -} - -// -// Helpers -// - -// isSpace reports whether r is a space character. -func isSpace(r rune) bool { - return r == ' ' || r == '\t' -} diff --git a/lexer/state.go b/lexer/state.go index 710b8fb..d7541ff 100644 --- a/lexer/state.go +++ b/lexer/state.go @@ -1,12 +1,42 @@ package lexer +import "strings" + func lexInitial(l *Lexer) stateFn { +loop: for { - switch l.next() { + switch r := l.next(); r { + case ' ', '\t': + return lexSpace(l) + case '\n': + l.lineNum++ + case 'n': + l.backup() + return lexNull(l) + case 't', 'f': + l.backup() + return lexBool(l) + case '1', '2', '3', '4', '5', '6', '7', '8', '9', '0': + l.backup() + return lexNumber(l) + case '"': + return lexString(l) + case '[': + l.emit(itemBracketOpen) + case ']': + l.emit(itemBracketClose) + case '{': + l.emit(itemBraceOpen) + case '}': + l.emit(itemBraceClose) + case ':': + l.emit(itemColon) + case ',': + l.emit(itemComma) case EOF: - break + break loop default: - panic("Unexpected symbol!") + panic("Unexpected symbol: " + string(r)) } } @@ -16,28 +46,81 @@ func lexInitial(l *Lexer) stateFn { return nil } -func lexNumber(l *Lexer) stateFn { - return lexInitial -} - -func lexString(l *Lexer) stateFn { - return lexInitial -} - -func lexArray(l *Lexer) stateFn { - return lexInitial -} - -func lexObject(l *Lexer) stateFn { - return lexInitial -} - -// lexSpace scans a run of space characters. -// One space has already been seen. +// Skip all spaces +// One space has already been seen func lexSpace(l *Lexer) stateFn { for isSpace(l.peek()) { l.next() } - l.emit(itemSpace) + l.ignore() return lexInitial } + +func lexNull(l *Lexer) stateFn { + if l.acceptString("null") { + l.emit(itemNull) + } else { + return l.errorf("Unexpected token") + } + return lexInitial +} + +func lexBool(l *Lexer) stateFn { + if l.acceptString("true") || l.acceptString("false") { + l.emit(itemBool) + } + return lexInitial +} + +func lexNumber(l *Lexer) stateFn { + hasDot := false + for { + if r := l.peek(); isDigit(r) { + l.next() + } else if r == '.' { + if hasDot { + return l.errorf("Invalid number") + } else { + hasDot = true + l.next() + } + } else { + break + } + } + + l.emit(itemNumber) + return lexInitial +} + +func lexString(l *Lexer) stateFn { + escaped := false +loop: + for { + switch r := l.next(); r { + case '\\': + escaped = true + case '"': + if escaped { + escaped = false + } else { + l.emit(itemString) + break loop + } + case EOF: + return l.errorf("String hits EOF") + default: + escaped = false + } + } + + return lexInitial +} + +func isSpace(r rune) bool { + return r == ' ' || r == '\t' +} + +func isDigit(r rune) bool { + return strings.IndexRune("1234567890", r) > -1 +} diff --git a/test.json b/test.json index 125ba27..6200fd0 100644 --- a/test.json +++ b/test.json @@ -2,7 +2,8 @@ "prices": { "apple": 25, "banana": 10, - "peach": 40 + "peach": 40.5, + "pomelo": null }, "bananas": [ {"length": 13, "weight": 5},