From 4c2af09c99b08c038327317597d723ffdb6218c4 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Fri, 1 Dec 2023 00:57:56 +0100 Subject: [PATCH] Implemented lexer and parser working for all provided test cases --- go.mod | 3 + lexer/lexer.go | 111 ++++++++++++++ lexer/lexer_test.go | 45 ++++++ parser/parser.go | 222 +++++++++++++++++++++++++++ parser/parser_test.go | 306 ++++++++++++++++++++++++++++++++++++++ tests/step1/invalid.json | 0 tests/step1/valid.json | 1 + tests/step2/invalid.json | 1 + tests/step2/invalid2.json | 4 + tests/step2/valid.json | 1 + tests/step2/valid2.json | 4 + tests/step3/invalid.json | 7 + tests/step3/valid.json | 7 + tests/step4/invalid.json | 8 + tests/step4/valid.json | 6 + tests/step4/valid2.json | 8 + token/token.go | 49 ++++++ 17 files changed, 783 insertions(+) create mode 100644 go.mod create mode 100644 lexer/lexer.go create mode 100644 lexer/lexer_test.go create mode 100644 parser/parser.go create mode 100644 parser/parser_test.go create mode 100644 tests/step1/invalid.json create mode 100644 tests/step1/valid.json create mode 100644 tests/step2/invalid.json create mode 100644 tests/step2/invalid2.json create mode 100644 tests/step2/valid.json create mode 100644 tests/step2/valid2.json create mode 100644 tests/step3/invalid.json create mode 100644 tests/step3/valid.json create mode 100644 tests/step4/invalid.json create mode 100644 tests/step4/valid.json create mode 100644 tests/step4/valid2.json create mode 100644 token/token.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..74e9492 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module gitea.paas.celticinfo.fr/oabrivard/gojson + +go 1.21.4 diff --git a/lexer/lexer.go b/lexer/lexer.go new file mode 100644 index 0000000..f558b67 --- /dev/null +++ b/lexer/lexer.go @@ -0,0 +1,111 @@ +package lexer + +import ( + "gitea.paas.celticinfo.fr/oabrivard/gojson/token" +) + +type Lexer struct { + input string + position int + readPosition int + ch byte +} + +func NewLexer(input 
string) *Lexer { + l := &Lexer{input: input} + l.readChar() + return l +} + +func (l *Lexer) NextToken() token.Token { + var tok token.Token + + l.skipWhitespace() + + switch l.ch { + case '{': + tok = token.NewToken(token.BEGIN_OBJECT, l.ch) + case '}': + tok = token.NewToken(token.END_OBJECT, l.ch) + case '[': + tok = token.NewToken(token.BEGIN_ARRAY, l.ch) + case ']': + tok = token.NewToken(token.END_ARRAY, l.ch) + case ':': + tok = token.NewToken(token.NAME_SEPARATOR, l.ch) + case ',': + tok = token.NewToken(token.VALUE_SEPARATOR, l.ch) + case '"': + tok.Type = token.STRING + tok.Value = l.readString() + case 0: + tok.Value = "" + tok.Type = token.EOF + default: + if isDigit(l.ch) || l.ch == '-' { + tok.Value = l.readNumber() + tok.Type = token.NUMBER + return tok + } else if isLetter(l.ch) { + tok.Value = l.readIdentifier() + tok.Type = token.LookupIdent(tok.Value) + return tok + } else { + tok = token.NewToken(token.ILLEGAL, l.ch) + } + } + + l.readChar() + return tok +} + +func (l *Lexer) readChar() { + if l.readPosition >= len(l.input) { + l.ch = 0 + } else { + l.ch = l.input[l.readPosition] + } + l.position = l.readPosition + l.readPosition++ +} + +func (l *Lexer) skipWhitespace() { + for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' { + l.readChar() + } +} + +func (l *Lexer) readNumber() string { + position := l.position + for isDigit(l.ch) || l.ch == '.' 
|| l.ch == '-' || l.ch == '+' || l.ch == 'e' || l.ch == 'E' { + l.readChar() + } + return l.input[position:l.position] +} + +func isDigit(ch byte) bool { + return '0' <= ch && ch <= '9' +} + +func (l *Lexer) readString() string { + position := l.position + 1 + for { + l.readChar() + if l.ch == '"' || l.ch == 0 { + break + } + } + return l.input[position:l.position] +} + +func (l *Lexer) readIdentifier() string { + position := l.position + for isLetter(l.ch) { + l.readChar() + } + return l.input[position:l.position] +} + +func isLetter(ch byte) bool { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ch == '_' +} diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go new file mode 100644 index 0000000..fdfa57d --- /dev/null +++ b/lexer/lexer_test.go @@ -0,0 +1,45 @@ +package lexer + +import ( + "testing" + + "gitea.paas.celticinfo.fr/oabrivard/gojson/token" +) + +func TestTokenizeSimpleObject(t *testing.T) { + input := `{"name": "John", "age": 30, "value": -3.5e+5}` + + tests := []struct { + expectedType token.TokenType + expectedLiteral string + }{ + {token.BEGIN_OBJECT, "{"}, + {token.STRING, "name"}, + {token.NAME_SEPARATOR, ":"}, + {token.STRING, "John"}, + {token.VALUE_SEPARATOR, ","}, + {token.STRING, "age"}, + {token.NAME_SEPARATOR, ":"}, + {token.NUMBER, "30"}, + {token.VALUE_SEPARATOR, ","}, + {token.STRING, "value"}, + {token.NAME_SEPARATOR, ":"}, + {token.NUMBER, "-3.5e+5"}, + {token.END_OBJECT, "}"}, + {token.EOF, ""}, + } + + l := NewLexer(input) + + for i, tt := range tests { + tok := l.NextToken() + + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", i, tt.expectedType, tok.Type) + } + + if tok.Value != tt.expectedLiteral { + t.Fatalf("tests[%d] - literal wrong. 
expected=%q, got=%q", i, tt.expectedLiteral, tok.Value) + } + } +} diff --git a/parser/parser.go b/parser/parser.go new file mode 100644 index 0000000..578aa18 --- /dev/null +++ b/parser/parser.go @@ -0,0 +1,222 @@ +package parser + +import ( + "errors" + "fmt" + "strconv" + "strings" + + "gitea.paas.celticinfo.fr/oabrivard/gojson/lexer" + "gitea.paas.celticinfo.fr/oabrivard/gojson/token" +) + +type Parser struct { + lexer *lexer.Lexer + + curToken token.Token + peekToken token.Token + + errors []string +} + +func NewParser(l *lexer.Lexer) *Parser { + p := &Parser{lexer: l} + // Initialize curToken and peekToken + p.nextToken() + p.nextToken() + return p +} + +func (p *Parser) nextToken() { + p.curToken = p.peekToken + p.peekToken = p.lexer.NextToken() +} + +// Methods to parse JSON structure + +type JsonObject map[string]interface{} +type JsonArray []interface{} + +func (p *Parser) Parse() JsonObject { + return p.parseObject() +} + +func (p *Parser) parseObject() JsonObject { + object := make(JsonObject) + + // Expect the current token to be TokenBeginObject + if !p.curTokenIs(token.BEGIN_OBJECT) { + p.addError("expected '{'") + return nil + } + + // Move to the next token + p.nextToken() + + // Loop until we find an end object token + for !p.curTokenIs(token.END_OBJECT) && !p.curTokenIs(token.EOF) { + key := p.parseObjectKey() + if key == "" { + return nil + } + + // Expect a name separator (:) + if !p.expectPeek(token.NAME_SEPARATOR) { + return nil + } + + // Move to the value token + p.nextToken() + + // Parse the value + value, err := p.parseValue() + if err != nil { + return nil + } + + object[key] = value + + // Move past the value, potentially to a comma or the end object + p.nextToken() + + // If we have a comma, the object continues + if p.curTokenIs(token.VALUE_SEPARATOR) { + if p.peekToken.Type == token.END_OBJECT { // no comma just before the end of the object + p.addError("No ',' before '}'") + return nil + } + + p.nextToken() + } + } + + // Expect 
the end object token + if !p.curTokenIs(token.END_OBJECT) { + p.addError("expected '}'") + return nil + } + + return object +} + +func (p *Parser) parseArray() JsonArray { + array := JsonArray{} + + // Expect the current token to be TokenBeginArray + if !p.curTokenIs(token.BEGIN_ARRAY) { + p.addError("expected '['") + return nil + } + + // Move to the next token + p.nextToken() + + // Loop until we find an end array token + for !p.curTokenIs(token.END_ARRAY) { + // Parse the value + value, err := p.parseValue() + if err != nil { + return nil + } + + array = append(array, value) + + // Move past the value + p.nextToken() + + // If we have a value separator (comma), continue to the next value + if p.curTokenIs(token.VALUE_SEPARATOR) { + p.nextToken() + } + } + + // Expect the end array token + if !p.curTokenIs(token.END_ARRAY) { + return nil + } + + return array +} + +func (p *Parser) addError(msg string) { + p.errors = append(p.errors, msg) +} + +func (p *Parser) parseObjectKey() string { + if p.curToken.Type != token.STRING { + p.addError("expected string for key") + return "" + } + return p.curToken.Value +} + +func (p *Parser) parseValue() (interface{}, error) { + switch p.curToken.Type { + case token.STRING: + return p.curToken.Value, nil + case token.NUMBER: + return p.parseNumber(), nil + case token.TRUE, token.FALSE: + return p.parseBoolean(), nil + case token.NULL: + return nil, nil + case token.BEGIN_OBJECT: + return p.parseObject(), nil + case token.BEGIN_ARRAY: + return p.parseArray(), nil + // ... 
other cases + default: + p.addError("unexpected token") + return nil, errors.New("unexpected token") + } +} + +func (p *Parser) parseNumber() interface{} { + // Assuming the number is in a string format in the token + numStr := p.curToken.Value + + // Check if the number is an integer or a float + if strings.Contains(numStr, ".") || strings.ContainsAny(numStr, "eE") { + // Parse as float + val, err := strconv.ParseFloat(numStr, 64) + if err != nil { + p.addError(fmt.Sprintf("could not parse %q as float", numStr)) + return nil + } + return val + } + + // Parse as integer + val, err := strconv.ParseInt(numStr, 10, 64) + if err != nil { + p.addError(fmt.Sprintf("could not parse %q as integer", numStr)) + return nil + } + return val +} + +func (p *Parser) parseBoolean() bool { + return p.curToken.Type == token.TRUE +} + +func (p *Parser) expectPeek(t token.TokenType) bool { + if p.peekToken.Type == t { + p.nextToken() + return true + } else { + p.addError(fmt.Sprintf("expected next token to be %v, got %v instead", t, p.peekToken.Type)) + return false + } +} + +func (p *Parser) curTokenIs(t token.TokenType) bool { + return p.curToken.Type == t +} + +/* +func (p *Parser) parseArray() *JsonArray { + // Implementation for parsing an array +} +*/ + +// ... 
other parse methods for different types diff --git a/parser/parser_test.go b/parser/parser_test.go new file mode 100644 index 0000000..7fcf47c --- /dev/null +++ b/parser/parser_test.go @@ -0,0 +1,306 @@ +package parser + +import ( + "reflect" + "testing" + + "gitea.paas.celticinfo.fr/oabrivard/gojson/lexer" +) + +func TestParseSimpleObject(t *testing.T) { + input := `{"name": "John", "age": 30, "isStudent": false}` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "name": "John", + "age": int64(30), // Assuming numbers are parsed as float64 + "isStudent": false, + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep1Valid(t *testing.T) { + input := `{}` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{} + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. 
Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep1Invalid(t *testing.T) { + input := `` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 1 || p.errors[0] != "expected '{'" { + t.Errorf("Not the expected error(s) during parsing, got %v", p.errors) + } + + if parsed != nil { + t.Errorf("expected a nil result from parsing an empty input") + } +} + +func TestParseStep2Valid1(t *testing.T) { + input := `{"key": "value"}` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "key": "value", + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep2Valid2(t *testing.T) { + input := `{ + "key": "value", + "key2": "value" + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "key": "value", + "key2": "value", + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. 
Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep2Invalid1(t *testing.T) { + input := `{"key": "value",}` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 1 || p.errors[0] != "No ',' before '}'" { + t.Errorf("Not the expected error(s) during parsing, got %v", p.errors) + } + + if parsed != nil { + t.Errorf("expected a nil result from parsing an empty input") + } +} + +func TestParseStep2Invalid2(t *testing.T) { + input := `{ + "key": "value", + key2: "value" + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 1 || p.errors[0] != "expected string for key" { + t.Errorf("Not the expected error(s) during parsing, got %v", p.errors) + } + + if parsed != nil { + t.Errorf("expected a nil result from parsing an empty input") + } +} + +func TestParseStep3Valid(t *testing.T) { + input := `{ + "key1": true, + "key2": false, + "key3": null, + "key4": "value", + "key5": 101 + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "key1": true, + "key2": false, + "key3": nil, + "key4": "value", + "key5": int64(101), + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. 
Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep3Invalid(t *testing.T) { + input := `{ + "key1": true, + "key2": False, + "key3": null, + "key4": "value", + "key5": 101 + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 1 || p.errors[0] != "unexpected token" { + t.Errorf("Not the expected error(s) during parsing, got %v", p.errors) + } + + if parsed != nil { + t.Errorf("expected a nil result from parsing an empty input") + } +} + +func TestParseStep4Valid1(t *testing.T) { + input := `{ + "key": "value", + "key-n": 101, + "key-o": {}, + "key-l": [] + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "key": "value", + "key-n": int64(101), + "key-o": JsonObject{}, + "key-l": JsonArray{}, + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep4Valid2(t *testing.T) { + input := `{ + "key": "value", + "key-n": 101, + "key-o": { + "inner key": "inner value" + }, + "key-l": ["list value"] + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 0 { + errMsg := "" + for _, s := range p.errors { + errMsg += s + "\n" + } + t.Fatalf(errMsg) + } + + expected := JsonObject{ + "key": "value", + "key-n": int64(101), + "key-o": JsonObject{ + "inner key": "inner value", + }, + "key-l": JsonArray{"list value"}, + } + + if !reflect.DeepEqual(parsed, expected) { + t.Errorf("parsed object is not as expected. 
Got %+v, want %+v", parsed, expected) + } +} + +func TestParseStep4Invalid(t *testing.T) { + input := `{ + "key": "value", + "key-n": 101, + "key-o": { + "inner key": "inner value" + }, + "key-l": ['list value'] + }` + + l := lexer.NewLexer(input) + p := NewParser(l) + parsed := p.Parse() + + if len(p.errors) != 2 || p.errors[0] != "unexpected token" || p.errors[1] != "expected string for key" { + t.Errorf("Not the expected error(s) during parsing, got %v", p.errors) + } + + if parsed != nil { + t.Errorf("expected a nil result from parsing an empty input") + } +} diff --git a/tests/step1/invalid.json b/tests/step1/invalid.json new file mode 100644 index 0000000..e69de29 diff --git a/tests/step1/valid.json b/tests/step1/valid.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/tests/step1/valid.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/tests/step2/invalid.json b/tests/step2/invalid.json new file mode 100644 index 0000000..d7e32b8 --- /dev/null +++ b/tests/step2/invalid.json @@ -0,0 +1 @@ +{"key": "value",} \ No newline at end of file diff --git a/tests/step2/invalid2.json b/tests/step2/invalid2.json new file mode 100644 index 0000000..eff13a5 --- /dev/null +++ b/tests/step2/invalid2.json @@ -0,0 +1,4 @@ +{ + "key": "value", + key2: "value" +} \ No newline at end of file diff --git a/tests/step2/valid.json b/tests/step2/valid.json new file mode 100644 index 0000000..76519fa --- /dev/null +++ b/tests/step2/valid.json @@ -0,0 +1 @@ +{"key": "value"} diff --git a/tests/step2/valid2.json b/tests/step2/valid2.json new file mode 100644 index 0000000..3c88601 --- /dev/null +++ b/tests/step2/valid2.json @@ -0,0 +1,4 @@ +{ + "key": "value", + "key2": "value" +} \ No newline at end of file diff --git a/tests/step3/invalid.json b/tests/step3/invalid.json new file mode 100644 index 0000000..94d2214 --- /dev/null +++ b/tests/step3/invalid.json @@ -0,0 +1,7 @@ +{ + "key1": true, + "key2": False, + "key3": null, + "key4": "value", + "key5": 
// ---- token/token.go ----

// TokenType identifies the lexical category of a token.
type TokenType int

const (
	// Special tokens
	EOF     TokenType = iota // end of the input
	ILLEGAL                  // any character or sequence that does not form a valid JSON token

	// Symbols and structure tokens
	BEGIN_ARRAY     // [
	END_ARRAY       // ]
	BEGIN_OBJECT    // {
	END_OBJECT      // }
	NAME_SEPARATOR  // :
	VALUE_SEPARATOR // ,

	// Whitespace (spaces, tabs, line feeds, carriage returns).
	// NOTE(review): the lexer currently skips whitespace instead of
	// emitting this token type; kept so existing constant values stand.
	WHITESPACE

	// Literal types
	STRING // a string literal
	NUMBER // a number literal
	TRUE   // the boolean value "true"
	FALSE  // the boolean value "false"
	NULL   // the "null" value
)

// tokenTypeNames maps each TokenType to a readable name for diagnostics.
// Keep in sync with the constant block above.
var tokenTypeNames = map[TokenType]string{
	EOF:             "EOF",
	ILLEGAL:         "ILLEGAL",
	BEGIN_ARRAY:     "BEGIN_ARRAY",
	END_ARRAY:       "END_ARRAY",
	BEGIN_OBJECT:    "BEGIN_OBJECT",
	END_OBJECT:      "END_OBJECT",
	NAME_SEPARATOR:  "NAME_SEPARATOR",
	VALUE_SEPARATOR: "VALUE_SEPARATOR",
	WHITESPACE:      "WHITESPACE",
	STRING:          "STRING",
	NUMBER:          "NUMBER",
	TRUE:            "TRUE",
	FALSE:           "FALSE",
	NULL:            "NULL",
}

// String implements fmt.Stringer so that %v on a TokenType (as used in
// the parser's error messages) prints a readable name instead of a bare
// integer.
func (t TokenType) String() string {
	if name, ok := tokenTypeNames[t]; ok {
		return name
	}
	return "UNKNOWN"
}

// Token is a lexeme with its category and literal value.
type Token struct {
	Type  TokenType
	Value string
}

// NewToken builds a token for a single-character lexeme.
func NewToken(tokenType TokenType, ch byte) Token {
	return Token{Type: tokenType, Value: string(ch)}
}

// keywords maps the three JSON literal names to their token types.
var keywords = map[string]TokenType{
	"true":  TRUE,
	"false": FALSE,
	"null":  NULL,
}

// LookupIdent classifies a bare identifier: JSON keywords get their
// dedicated token type; anything else is ILLEGAL, since JSON has no
// other bare words.
func LookupIdent(ident string) TokenType {
	if tok, ok := keywords[ident]; ok {
		return tok
	}
	return ILLEGAL
}