From 4db842b082d4853237842a1fff5112d8d705b98f Mon Sep 17 00:00:00 2001 From: Jens Neuse Date: Mon, 2 Mar 2026 23:47:25 +0100 Subject: [PATCH 1/4] feat: extreme performance optimizations for arena parsing and MergeValues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 1 optimizations: - Character lookup table (charFlags) for parseRawString/skipWS - Type shrunk from int to uint8 for cache-friendly structures - Sentinel errors replacing fmt.Errorf in hot paths - Pre-sized object/array slices with branch prediction hints - Optimized unescapeStringBestEffort with arena buffer reuse - Hex digit lookup table replacing conditional chains - Lazy hash map for Object.Get on large objects (>16 keys) - Eliminated strings.EqualFold in parseRawNumber Round 2 optimizations (pprof-driven): - Slab allocation for Value/kv via parseContext (growing 8→64), reducing arena allocation overhead by batching Alloc calls - parseRawKey using strings.IndexByte (SIMD) instead of byte-by-byte - MergeValues: raw string fast path for number comparison (avoids fastfloat.Parse), direct field comparison for strings, cached type reads with deferred boolean computation Arena parsing: -13% to -49% (canada.json nearly 2x faster) Arena throughput: up to 1.5 GB/s (twitter), 991 MB/s (20mb) MergeValues: -4% to -39% depending on value types Arena memory: -20% to -49% bytes/op on large inputs Co-Authored-By: Claude Opus 4.6 --- benchmark_comprehensive_test.go | 1239 +++++++++++++++++++++++++++++++ chartable.go | 64 ++ fastfloat/parse.go | 42 +- mergevalues.go | 47 +- parser.go | 376 ++++++---- parser_test.go | 4 +- scanner.go | 3 +- update.go | 2 + validate.go | 92 ++- 9 files changed, 1649 insertions(+), 220 deletions(-) create mode 100644 benchmark_comprehensive_test.go create mode 100644 chartable.go diff --git a/benchmark_comprehensive_test.go b/benchmark_comprehensive_test.go new file mode 100644 index 0000000..a4d517e --- /dev/null +++ 
b/benchmark_comprehensive_test.go @@ -0,0 +1,1239 @@ +package astjson + +import ( + "strings" + "testing" + + "github.com/wundergraph/go-arena" +) + +// Sink vars prevent dead-code elimination by the compiler. +var ( + sinkValue *Value + sinkBytes []byte + sinkString string + sinkInt int + sinkFloat64 float64 + sinkBool bool + sinkErr error +) + +// fixture pairs a name with its JSON data for table-driven benchmarks. +type fixture struct { + name string + data string +} + +// fixtures is the default set (excludes 20mb for speed). +var fixtures = []fixture{ + {"small", smallFixture}, + {"medium", mediumFixture}, + {"large", largeFixture}, + {"canada", canadaFixture}, + {"citm", citmFixture}, + {"twitter", twitterFixture}, +} + +// bunchFieldsFixture is an 871-key object for large-object benchmarks. +var bunchFieldsFixture = getFromFile("testdata/bunchFields.json") + +// --------------------------------------------------------------------------- +// Section 1: Parsing +// --------------------------------------------------------------------------- + +func BenchmarkSTParse(b *testing.B) { + for _, f := range fixtures { + b.Run(f.name, func(b *testing.B) { + benchmarkSTParse(b, f.data) + }) + } + b.Run("20mb", func(b *testing.B) { + benchmarkSTParse(b, huge20MbFixture) + }) +} + +func benchmarkSTParse(b *testing.B, data string) { + var p Parser + b.ReportAllocs() + b.SetBytes(int64(len(data))) + b.ResetTimer() + for b.Loop() { + v, err := p.Parse(data) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } +} + +func BenchmarkSTParseArena(b *testing.B) { + for _, f := range fixtures { + b.Run(f.name, func(b *testing.B) { + benchmarkSTParseArena(b, f.data, 2*1024*1024) + }) + } + b.Run("20mb", func(b *testing.B) { + benchmarkSTParseArena(b, huge20MbFixture, 32*1024*1024) + }) +} + +func benchmarkSTParseArena(b *testing.B, data string, arenaSize int) { + var p Parser + a := arena.NewMonotonicArena(arena.WithMinBufferSize(arenaSize)) + b.ReportAllocs() + 
b.SetBytes(int64(len(data))) + b.ResetTimer() + for b.Loop() { + v, err := p.ParseWithArena(a, data) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } +} + +func BenchmarkSTParseBytes(b *testing.B) { + for _, name := range []string{"small", "medium", "large", "twitter"} { + var data string + for _, f := range fixtures { + if f.name == name { + data = f.data + break + } + } + b.Run(name, func(b *testing.B) { + var p Parser + bb := []byte(data) + b.ReportAllocs() + b.SetBytes(int64(len(bb))) + b.ResetTimer() + for b.Loop() { + v, err := p.ParseBytes(bb) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } + }) + } +} + +func BenchmarkSTParseBytesArena(b *testing.B) { + for _, name := range []string{"small", "medium", "large", "twitter"} { + var data string + for _, f := range fixtures { + if f.name == name { + data = f.data + break + } + } + b.Run(name, func(b *testing.B) { + var p Parser + a := arena.NewMonotonicArena(arena.WithMinBufferSize(2 * 1024 * 1024)) + bb := []byte(data) + b.ReportAllocs() + b.SetBytes(int64(len(bb))) + b.ResetTimer() + for b.Loop() { + v, err := p.ParseBytesWithArena(a, bb) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) + } +} + +func BenchmarkSTParseRawString(b *testing.B) { + cases := []struct { + name string + s string // input after the opening quote (already stripped) + }{ + {"empty", `"`}, + {"short", `hello"`}, + {"medium", `abcdefghijklmnopqrstuvwxyz012345678901234567890123"`}, + {"with_escape", `hello\"world\\nfoo"`}, + {"unicode", `\u0048\u0065\u006C\u006C\u006F"`}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + rs, _, err := parseRawString(tc.s) + if err != nil { + b.Fatal(err) + } + sinkString = rs + } + }) + } +} + +func BenchmarkSTParseRawNumber(b *testing.B) { + cases := []struct { + name string + s string + }{ + {"int", "12345,"}, + {"float", "123.456,"}, + {"exp", 
"123.456e+78,"}, + {"negative", "-12345.6789,"}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + rn, _, err := parseRawNumber(tc.s) + if err != nil { + b.Fatal(err) + } + sinkString = rn + } + }) + } +} + +func BenchmarkSTParseRawKey(b *testing.B) { + cases := []struct { + name string + s string // after the opening quote + }{ + {"simple", `username"`}, + {"long", strings.Repeat("a", 100) + `"`}, + {"with_escape", `user\"name"`}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + k, _, err := parseRawKey(tc.s) + if err != nil { + b.Fatal(err) + } + sinkString = k + } + }) + } +} + +func BenchmarkSTSkipWS(b *testing.B) { + cases := []struct { + name string + s string + }{ + {"none", `{"key": 1}`}, + {"short", ` {"key": 1}`}, + {"long", strings.Repeat(" ", 256) + `{"key": 1}`}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + sinkString = skipWS(tc.s) + } + }) + } +} + +func BenchmarkSTUnescapeStringBestEffort(b *testing.B) { + cases := []struct { + name string + s string + }{ + {"no_escape", "hello world plain text"}, + {"simple_escape", `hello\nworld\ttab`}, + {"unicode_escape", `\u0048\u0065\u006C\u006C\u006F`}, + {"surrogate_pair", `\uD83D\uDE00 smile`}, + } + for _, tc := range cases { + b.Run("heap/"+tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + sinkString = unescapeStringBestEffort(nil, tc.s) + } + }) + b.Run("arena/"+tc.name, func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + sinkString = unescapeStringBestEffort(a, tc.s) + a.Reset() + } + }) + } +} + +func 
BenchmarkSTEscapeString(b *testing.B) { + cases := []struct { + name string + s string + }{ + {"no_special", "hello world plain text no special chars here"}, + {"with_quotes", `hello "world" said "foo"`}, + {"with_control", "hello\nworld\ttab\rreturn"}, + {"mixed", "he said \"hi\"\nbye\\done"}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + dst := make([]byte, 0, 256) + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + sinkBytes = escapeString(dst[:0], tc.s) + } + }) + } +} + +func BenchmarkSTHasSpecialChars(b *testing.B) { + plain100 := strings.Repeat("abcdefghij", 10) + cases := []struct { + name string + s string + }{ + {"none_short", "hello"}, + {"none_long", plain100}, + {"early_hit", `ab"cd`}, + {"late_hit", plain100[:99] + `"`}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(tc.s))) + b.ResetTimer() + for b.Loop() { + sinkBool = hasSpecialChars(tc.s) + } + }) + } +} + +func BenchmarkSTParseInlineSmall(b *testing.B) { + cases := []struct { + name string + s string + }{ + {"null", "null"}, + {"true", "true"}, + {"false", "false"}, + {"number", "12345"}, + {"string", `"hello world"`}, + {"empty_object", "{}"}, + {"empty_array", "[]"}, + {"small_object", `{"a":1,"b":"x","c":true}`}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + var p Parser + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, err := p.Parse(tc.s) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } + }) + } +} + +func BenchmarkSTParseDeepNesting(b *testing.B) { + // Build 100-level nested object: {"a":{"a":{"a":...1...}}} + depth := 100 + s := "1" + for range depth { + s = `{"a":` + s + `}` + } + var p Parser + b.ReportAllocs() + b.SetBytes(int64(len(s))) + b.ResetTimer() + for b.Loop() { + v, err := p.Parse(s) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } +} + +// 
--------------------------------------------------------------------------- +// Section 2: Value Access +// --------------------------------------------------------------------------- + +func BenchmarkSTValueGet(b *testing.B) { + var p Parser + v, err := p.Parse(twitterFixture) + if err != nil { + b.Fatal(err) + } + + cases := []struct { + name string + keys []string + }{ + {"shallow", []string{"statuses"}}, + {"deep_2", []string{"statuses", "0"}}, + {"deep_3", []string{"statuses", "0", "user"}}, + {"miss", []string{"nonexistent"}}, + } + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = v.Get(tc.keys...) + } + }) + } +} + +func BenchmarkSTObjectGet(b *testing.B) { + var p Parser + + // Small object: twitter statuses[0] (~25 keys) + tv, err := p.Parse(twitterFixture) + if err != nil { + b.Fatal(err) + } + smallObj := tv.Get("statuses", "0").GetObject() + + // Large object: bunchFields (871 keys) + bv, err := p.Parse(bunchFieldsFixture) + if err != nil { + b.Fatal(err) + } + largeObj := bv.GetObject() + + b.Run("small_hit", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = smallObj.Get("user") + } + }) + b.Run("small_miss", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = smallObj.Get("nonexistent_key_xyz") + } + }) + b.Run("large_first", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = largeObj.Get("4") + } + }) + b.Run("large_last", func(b *testing.B) { + // Get the last key + var lastKey string + largeObj.Visit(func(key []byte, v *Value) { + lastKey = string(key) + }) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = largeObj.Get(lastKey) + } + }) + b.Run("large_miss", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = largeObj.Get("nonexistent_key_xyz") + } + }) +} + +func BenchmarkSTObjectVisit(b *testing.B) { + var p Parser + + tv, err := p.Parse(twitterFixture) + if err != nil { 
+ b.Fatal(err) + } + obj := tv.Get("statuses", "0").GetObject() + + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + obj.Visit(func(key []byte, v *Value) { + sinkBytes = key + }) + } +} + +func BenchmarkSTGetStringBytes(b *testing.B) { + var p Parser + v, _ := p.Parse(`{"name":"hello world"}`) + sv := v.Get("name") + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sb, err := sv.StringBytes() + if err != nil { + b.Fatal(err) + } + sinkBytes = sb + } +} + +func BenchmarkSTGetInt(b *testing.B) { + var p Parser + v, _ := p.Parse(`{"count":12345}`) + nv := v.Get("count") + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + n, err := nv.Int() + if err != nil { + b.Fatal(err) + } + sinkInt = n + } +} + +func BenchmarkSTGetFloat64(b *testing.B) { + var p Parser + v, _ := p.Parse(`{"price":123.456}`) + fv := v.Get("price") + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + f, err := fv.Float64() + if err != nil { + b.Fatal(err) + } + sinkFloat64 = f + } +} + +func BenchmarkSTGetBool(b *testing.B) { + var p Parser + v, _ := p.Parse(`{"active":true}`) + bv := v.Get("active") + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkBool = bv.GetBool() + } +} + +// --------------------------------------------------------------------------- +// Section 3: Merging +// --------------------------------------------------------------------------- + +func BenchmarkSTMergeValuesObject(b *testing.B) { + b.Run("small", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aVal, _ := p.ParseWithArena(a, `{"x":1,"y":2,"z":3}`) + bVal, _ := p.ParseWithArena(a, `{"y":20,"w":4}`) + aBytes := []byte(aVal.String()) + bBytes := []byte(bVal.String()) + a.Reset() + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValues(a, av, bv) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) + b.Run("medium", func(b 
*testing.B) { + // Merge a 10-key object with a 5-key object; 3 keys (h, i, j) overlap + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aJSON := `{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9,"j":10}` + bJSON := `{"h":80,"i":90,"j":100,"k":11,"l":12}` + aBytes := []byte(aJSON) + bBytes := []byte(bJSON) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValues(a, av, bv) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) + b.Run("large", func(b *testing.B) { + // Merge two copies of twitter statuses[0] (realistic large object merge) + a := arena.NewMonotonicArena(arena.WithMinBufferSize(2 * 1024 * 1024)) + var p Parser + tv, _ := p.Parse(twitterFixture) + obj := tv.Get("statuses", "0") + objJSON := obj.String() + objBytes := []byte(objJSON) + b.ReportAllocs() + b.SetBytes(int64(len(objBytes))) + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, objBytes) + bv, _ := p.ParseBytesWithArena(a, objBytes) + v, _, err := MergeValues(a, av, bv) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTMergeValuesArray(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aBytes := []byte(`[1,2,3,4,5,6,7,8,9,10]`) + bBytes := []byte(`[11,12,13,14,15,16,17,18,19,20]`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValues(a, av, bv) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } +} + +func BenchmarkSTMergeValuesScalar(b *testing.B) { + b.Run("string", func(b *testing.B) { + var p Parser + aVal, _ := p.Parse(`"hello"`) + bVal, _ := p.Parse(`"world"`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _, err := MergeValues(nil, aVal, bVal) + if err != nil { + b.Fatal(err) + } + sinkValue = v + 
} + }) + b.Run("number", func(b *testing.B) { + var p Parser + aVal, _ := p.Parse(`123`) + bVal, _ := p.Parse(`456`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _, err := MergeValues(nil, aVal, bVal) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } + }) + b.Run("bool", func(b *testing.B) { + var p Parser + aVal, _ := p.Parse(`true`) + bVal, _ := p.Parse(`false`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _, err := MergeValues(nil, aVal, bVal) + if err != nil { + b.Fatal(err) + } + sinkValue = v + } + }) +} + +func BenchmarkSTMergeValuesWithPath(b *testing.B) { + b.Run("depth_1", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aBytes := []byte(`{"data":"old_value"}`) + bBytes := []byte(`"new_value"`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValuesWithPath(a, av, bv, "data") + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) + b.Run("depth_3", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aBytes := []byte(`{"data":{"user":{"name":"old"}}}`) + bBytes := []byte(`"new_name"`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValuesWithPath(a, av, bv, "data", "user", "name") + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTMergeValuesNested(b *testing.B) { + // 5 levels of nesting with overlapping keys at each level + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + aBytes := []byte(`{"l1":{"l2":{"l3":{"l4":{"l5":"a_val"},"x":1},"y":2},"z":3},"w":4}`) + bBytes := []byte(`{"l1":{"l2":{"l3":{"l4":{"l5":"b_val"},"x":10},"y":20},"z":30},"w":40}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + av, _ := 
p.ParseBytesWithArena(a, aBytes) + bv, _ := p.ParseBytesWithArena(a, bBytes) + v, _, err := MergeValues(a, av, bv) + if err != nil { + b.Fatal(err) + } + sinkValue = v + a.Reset() + } +} + +// --------------------------------------------------------------------------- +// Section 4: Value Creation +// --------------------------------------------------------------------------- + +func BenchmarkSTValueCreationHeap(b *testing.B) { + b.Run("string", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = StringValue(nil, "hello world") + } + }) + b.Run("int", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = IntValue(nil, 12345) + } + }) + b.Run("float", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = FloatValue(nil, 123.456) + } + }) + b.Run("true", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = TrueValue(nil) + } + }) + b.Run("false", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = FalseValue(nil) + } + }) + b.Run("object", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = ObjectValue(nil) + } + }) + b.Run("array", func(b *testing.B) { + b.ReportAllocs() + for b.Loop() { + sinkValue = ArrayValue(nil) + } + }) +} + +func BenchmarkSTValueCreationArena(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(1024 * 1024)) + b.Run("string", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = StringValue(a, "hello world") + a.Reset() + } + }) + b.Run("int", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = IntValue(a, 12345) + a.Reset() + } + }) + b.Run("float", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = FloatValue(a, 123.456) + a.Reset() + } + }) + b.Run("true", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = TrueValue(a) + a.Reset() + } + }) + b.Run("false", func(b 
*testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = FalseValue(a) + a.Reset() + } + }) + b.Run("object", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = ObjectValue(a) + a.Reset() + } + }) + b.Run("array", func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + sinkValue = ArrayValue(a) + a.Reset() + } + }) +} + +// --------------------------------------------------------------------------- +// Section 5: Mutation +// --------------------------------------------------------------------------- + +func BenchmarkSTObjectSet(b *testing.B) { + b.Run("new_key", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"a":1,"b":2,"c":3}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + v.Set(a, "d", IntValue(a, 4)) + sinkValue = v + a.Reset() + } + }) + b.Run("existing_key", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"a":1,"b":2,"c":3}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + v.Set(a, "b", IntValue(a, 20)) + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTObjectDel(b *testing.B) { + b.Run("first_key", func(b *testing.B) { + var p Parser + base := `{"a":1,"b":2,"c":3,"d":4,"e":5}` + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.Parse(base) + v.Del("a") + sinkValue = v + } + }) + b.Run("last_key", func(b *testing.B) { + var p Parser + base := `{"a":1,"b":2,"c":3,"d":4,"e":5}` + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.Parse(base) + v.Del("e") + sinkValue = v + } + }) + b.Run("miss", func(b *testing.B) { + var p Parser + base := `{"a":1,"b":2,"c":3,"d":4,"e":5}` + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.Parse(base) + v.Del("nonexistent") + sinkValue = v + } + }) +} + +func 
BenchmarkSTSetArrayItem(b *testing.B) { + b.Run("replace", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`[1,2,3,4,5]`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + v.SetArrayItem(a, 2, IntValue(a, 30)) + sinkValue = v + a.Reset() + } + }) + b.Run("append", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`[1,2,3,4,5]`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + v.SetArrayItem(a, 5, IntValue(a, 6)) + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTAppendToArray(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`[1,2,3,4,5,6,7,8,9,10]`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + AppendToArray(a, v, IntValue(a, 11)) + sinkValue = v + a.Reset() + } +} + +// --------------------------------------------------------------------------- +// Section 6: Serialization +// --------------------------------------------------------------------------- + +func BenchmarkSTMarshalTo(b *testing.B) { + var p Parser + for _, f := range fixtures { + b.Run(f.name, func(b *testing.B) { + v, err := p.Parse(f.data) + if err != nil { + b.Fatal(err) + } + dst := make([]byte, 0, len(f.data)) + b.ReportAllocs() + b.SetBytes(int64(len(f.data))) + b.ResetTimer() + for b.Loop() { + sinkBytes = v.MarshalTo(dst[:0]) + } + }) + } + b.Run("20mb", func(b *testing.B) { + v, err := p.Parse(huge20MbFixture) + if err != nil { + b.Fatal(err) + } + dst := make([]byte, 0, len(huge20MbFixture)) + b.ReportAllocs() + b.SetBytes(int64(len(huge20MbFixture))) + b.ResetTimer() + for b.Loop() { + sinkBytes = v.MarshalTo(dst[:0]) + } + }) +} + +func BenchmarkSTMarshalToArena(b *testing.B) { + for _, f := range fixtures { + b.Run(f.name, func(b *testing.B) { + var p 
Parser + a := arena.NewMonotonicArena(arena.WithMinBufferSize(2 * 1024 * 1024)) + dst := make([]byte, 0, len(f.data)) + b.ReportAllocs() + b.SetBytes(int64(len(f.data))) + b.ResetTimer() + for b.Loop() { + v, err := p.ParseWithArena(a, f.data) + if err != nil { + b.Fatal(err) + } + sinkBytes = v.MarshalTo(dst[:0]) + a.Reset() + } + }) + } +} + +// --------------------------------------------------------------------------- +// Section 7: Utilities +// --------------------------------------------------------------------------- + +func BenchmarkSTDeepCopy(b *testing.B) { + subsets := []struct { + name string + data string + }{ + {"small", smallFixture}, + {"medium", mediumFixture}, + {"twitter", twitterFixture}, + } + for _, s := range subsets { + b.Run(s.name, func(b *testing.B) { + var p Parser + v, err := p.Parse(s.data) + if err != nil { + b.Fatal(err) + } + a := arena.NewMonotonicArena(arena.WithMinBufferSize(2 * 1024 * 1024)) + b.ReportAllocs() + b.SetBytes(int64(len(s.data))) + b.ResetTimer() + for b.Loop() { + sinkValue = DeepCopy(a, v) + a.Reset() + } + }) + } +} + +func BenchmarkSTDeduplicateObjectKeys(b *testing.B) { + b.Run("small", func(b *testing.B) { + var p Parser + data := `{"a":1,"b":2,"a":3,"c":4,"b":5}` + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.Parse(data) + DeduplicateObjectKeysRecursively(v) + sinkValue = v + } + }) + b.Run("large", func(b *testing.B) { + // Use bunchFields fixture (871 keys, all unique — tests the scan cost) + var p Parser + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.Parse(bunchFieldsFixture) + DeduplicateObjectKeysRecursively(v) + sinkValue = v + } + }) +} + +func BenchmarkSTSetValue(b *testing.B) { + b.Run("existing_path", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"data":{"user":{"name":"old"}}}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + SetValue(a, v, 
StringValue(a, "new"), "data", "user", "name") + sinkValue = v + a.Reset() + } + }) + b.Run("new_path_depth2", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"data":{}}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + SetValue(a, v, StringValue(a, "value"), "data", "newkey") + sinkValue = v + a.Reset() + } + }) + b.Run("new_path_depth4", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + SetValue(a, v, StringValue(a, "value"), "a", "b", "c", "d") + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTSetNull(b *testing.B) { + b.Run("depth_1", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"data":"value"}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + SetNull(a, v, "data") + sinkValue = v + a.Reset() + } + }) + b.Run("depth_3", func(b *testing.B) { + a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) + var p Parser + base := []byte(`{"a":{"b":{"c":"value"}}}`) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + v, _ := p.ParseBytesWithArena(a, base) + SetNull(a, v, "a", "b", "c") + sinkValue = v + a.Reset() + } + }) +} + +// --------------------------------------------------------------------------- +// Section 8: Validation +// --------------------------------------------------------------------------- + +func BenchmarkSTValidate(b *testing.B) { + for _, f := range fixtures { + b.Run(f.name, func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(len(f.data))) + b.ResetTimer() + for b.Loop() { + sinkErr = Validate(f.data) + } + }) + } +} + +// --------------------------------------------------------------------------- +// Section 9: Scanner +// 
--------------------------------------------------------------------------- + +func BenchmarkSTScanner(b *testing.B) { + // Concatenate 100 copies of small.json separated by whitespace + parts := make([]string, 100) + for i := range parts { + parts[i] = smallFixture + } + data := strings.Join(parts, "\n") + + var sc Scanner + b.ReportAllocs() + b.SetBytes(int64(len(data))) + b.ResetTimer() + for b.Loop() { + sc.Init(data) + for sc.Next() { + sinkValue = sc.Value() + } + if sc.Error() != nil { + b.Fatal(sc.Error()) + } + } +} + +// --------------------------------------------------------------------------- +// Section 10: End-to-End +// --------------------------------------------------------------------------- + +func BenchmarkSTParseAndGetMultiple(b *testing.B) { + b.Run("heap", func(b *testing.B) { + var p Parser + b.ReportAllocs() + b.SetBytes(int64(len(twitterFixture))) + b.ResetTimer() + for b.Loop() { + v, err := p.Parse(twitterFixture) + if err != nil { + b.Fatal(err) + } + _ = v.Get("statuses") + _ = v.Get("statuses", "0", "user") + _ = v.Get("search_metadata") + _ = v.Get("statuses", "0", "text") + sinkValue = v + } + }) + b.Run("arena", func(b *testing.B) { + var p Parser + a := arena.NewMonotonicArena(arena.WithMinBufferSize(2 * 1024 * 1024)) + b.ReportAllocs() + b.SetBytes(int64(len(twitterFixture))) + b.ResetTimer() + for b.Loop() { + v, err := p.ParseWithArena(a, twitterFixture) + if err != nil { + b.Fatal(err) + } + _ = v.Get("statuses") + _ = v.Get("statuses", "0", "user") + _ = v.Get("search_metadata") + _ = v.Get("statuses", "0", "text") + sinkValue = v + a.Reset() + } + }) +} + +func BenchmarkSTParseModifyMarshal(b *testing.B) { + var p Parser + data := smallFixture + dst := make([]byte, 0, len(data)*2) + b.ReportAllocs() + b.SetBytes(int64(len(data))) + b.ResetTimer() + for b.Loop() { + v, err := p.Parse(data) + if err != nil { + b.Fatal(err) + } + v.Set(nil, "newkey", StringValue(nil, "newval")) + sinkBytes = v.MarshalTo(dst[:0]) + } +} diff 
--git a/chartable.go b/chartable.go new file mode 100644 index 0000000..ddcc3f1 --- /dev/null +++ b/chartable.go @@ -0,0 +1,64 @@ +package astjson + +const ( + charWS uint8 = 1 << 0 // whitespace: space, tab, newline, CR + charNumChar uint8 = 1 << 1 // valid in number: digits, ., -, +, e, E + charEscape uint8 = 1 << 2 // needs escaping in JSON string: ", \, < 0x20 +) + +// charFlags is a 256-byte lookup table for character classification. +// Replaces multi-branch comparisons in hot loops with a single table lookup. +var charFlags [256]uint8 + +// hexDigit maps ASCII bytes to their hex digit value (0-15). +// Invalid hex chars are mapped to 0xFF. +var hexDigit [256]uint8 + +func init() { + // Whitespace + charFlags[0x20] |= charWS // space + charFlags[0x09] |= charWS // tab + charFlags[0x0A] |= charWS // newline + charFlags[0x0D] |= charWS // carriage return + + // Number characters + for c := byte('0'); c <= '9'; c++ { + charFlags[c] |= charNumChar + } + charFlags['.'] |= charNumChar + charFlags['-'] |= charNumChar + charFlags['+'] |= charNumChar + charFlags['e'] |= charNumChar + charFlags['E'] |= charNumChar + + // Characters that need escaping in JSON strings + charFlags['"'] |= charEscape + charFlags['\\'] |= charEscape + for c := range 0x20 { + charFlags[c] |= charEscape + } + + // Hex digit lookup (0xFF = invalid) + for i := range hexDigit { + hexDigit[i] = 0xFF + } + for c := byte('0'); c <= '9'; c++ { + hexDigit[c] = c - '0' + } + for c := byte('a'); c <= 'f'; c++ { + hexDigit[c] = c - 'a' + 10 + } + for c := byte('A'); c <= 'F'; c++ { + hexDigit[c] = c - 'A' + 10 + } +} + +// parseHex4 parses the first 4 bytes of s as hex digits into a uint16. +// Returns the value and true on success, or 0 and false on invalid input. The caller must ensure len(s) >= 4; s is indexed without a length check. 
+func parseHex4(s string) (uint16, bool) { + a, b, c, d := hexDigit[s[0]], hexDigit[s[1]], hexDigit[s[2]], hexDigit[s[3]] + if (a|b|c|d)&0xF0 != 0 { + return 0, false + } + return uint16(a)<<12 | uint16(b)<<8 | uint16(c)<<4 | uint16(d), true +} diff --git a/fastfloat/parse.go b/fastfloat/parse.go index 9a562a2..a91f61f 100644 --- a/fastfloat/parse.go +++ b/fastfloat/parse.go @@ -1,12 +1,18 @@ package fastfloat import ( - "fmt" + "errors" "math" "strconv" "strings" ) +var ( + errParseUint64Empty = errors.New("cannot parse uint64 from empty string") + errParseInt64Empty = errors.New("cannot parse int64 from empty string") + errParseFloat64Empty = errors.New("cannot parse float64 from empty string") +) + // ParseUint64BestEffort parses uint64 number s. // // It is equivalent to strconv.ParseUint(s, 10, 64), but is faster. @@ -54,7 +60,7 @@ func ParseUint64BestEffort(s string) uint64 { // See also ParseUint64BestEffort. func ParseUint64(s string) (uint64, error) { if len(s) == 0 { - return 0, fmt.Errorf("cannot parse uint64 from empty string") + return 0, errParseUint64Empty } i := uint(0) d := uint64(0) @@ -77,11 +83,11 @@ func ParseUint64(s string) (uint64, error) { break } if i <= j { - return 0, fmt.Errorf("cannot parse uint64 from %q", s) + return 0, errors.New("cannot parse uint64 from " + strconv.Quote(s)) } if i < uint(len(s)) { // Unparsed tail left. - return 0, fmt.Errorf("unparsed tail left after parsing uint64 from %q: %q", s, s[i:]) + return 0, errors.New("unparsed tail left after parsing uint64 from " + strconv.Quote(s) + ": " + strconv.Quote(s[i:])) } return d, nil } @@ -144,14 +150,14 @@ func ParseInt64BestEffort(s string) int64 { // See also ParseInt64BestEffort. 
func ParseInt64(s string) (int64, error) { if len(s) == 0 { - return 0, fmt.Errorf("cannot parse int64 from empty string") + return 0, errParseInt64Empty } i := uint(0) minus := s[0] == '-' if minus { i++ if i >= uint(len(s)) { - return 0, fmt.Errorf("cannot parse int64 from %q", s) + return 0, errors.New("cannot parse int64 from " + strconv.Quote(s)) } } @@ -175,11 +181,11 @@ func ParseInt64(s string) (int64, error) { break } if i <= j { - return 0, fmt.Errorf("cannot parse int64 from %q", s) + return 0, errors.New("cannot parse int64 from " + strconv.Quote(s)) } if i < uint(len(s)) { // Unparsed tail left. - return 0, fmt.Errorf("unparsed tail left after parsing int64 form %q: %q", s, s[i:]) + return 0, errors.New("unparsed tail left after parsing int64 from " + strconv.Quote(s) + ": " + strconv.Quote(s[i:])) } if minus { d = -d @@ -355,21 +361,21 @@ func ParseBestEffort(s string) float64 { // See also ParseBestEffort. func Parse(s string) (float64, error) { if len(s) == 0 { - return 0, fmt.Errorf("cannot parse float64 from empty string") + return 0, errParseFloat64Empty } i := uint(0) minus := s[0] == '-' if minus { i++ if i >= uint(len(s)) { - return 0, fmt.Errorf("cannot parse float64 from %q", s) + return 0, errors.New("cannot parse float64 from " + strconv.Quote(s)) } } // the integer part might be elided to remain compliant // with https://go.dev/ref/spec#Floating-point_literals if s[i] == '.' 
&& (i+1 >= uint(len(s)) || s[i+1] < '0' || s[i+1] > '9') { - return 0, fmt.Errorf("missing integer and fractional part in %q", s) + return 0, errors.New("missing integer and fractional part in " + strconv.Quote(s)) } d := uint64(0) @@ -405,7 +411,7 @@ func Parse(s string) (float64, error) { if strings.EqualFold(ss, "nan") { return nan, nil } - return 0, fmt.Errorf("unparsed tail left after parsing float64 from %q: %q", s, ss) + return 0, errors.New("unparsed tail left after parsing float64 from " + strconv.Quote(s) + ": " + strconv.Quote(ss)) } f := float64(d) if i >= uint(len(s)) { @@ -433,7 +439,7 @@ func Parse(s string) (float64, error) { // The mantissa is out of range. Fall back to standard parsing. f, err := strconv.ParseFloat(s, 64) if err != nil && !math.IsInf(f, 0) { - return 0, fmt.Errorf("cannot parse mantissa in %q: %s", s, err) + return 0, errors.New("cannot parse mantissa in " + strconv.Quote(s) + ": " + err.Error()) } return f, nil } @@ -455,14 +461,14 @@ func Parse(s string) (float64, error) { // Parse exponent part. i++ if i >= uint(len(s)) { - return 0, fmt.Errorf("cannot parse exponent in %q", s) + return 0, errors.New("cannot parse exponent in " + strconv.Quote(s)) } expMinus := false if s[i] == '+' || s[i] == '-' { expMinus = s[i] == '-' i++ if i >= uint(len(s)) { - return 0, fmt.Errorf("cannot parse exponent in %q", s) + return 0, errors.New("cannot parse exponent in " + strconv.Quote(s)) } } exp := int16(0) @@ -476,7 +482,7 @@ func Parse(s string) (float64, error) { // Fall back to standard parsing. 
f, err := strconv.ParseFloat(s, 64) if err != nil && !math.IsInf(f, 0) { - return 0, fmt.Errorf("cannot parse exponent in %q: %s", s, err) + return 0, errors.New("cannot parse exponent in " + strconv.Quote(s) + ": " + err.Error()) } return f, nil } @@ -485,7 +491,7 @@ func Parse(s string) (float64, error) { break } if i <= j { - return 0, fmt.Errorf("cannot parse exponent in %q", s) + return 0, errors.New("cannot parse exponent in " + strconv.Quote(s)) } if expMinus { exp = -exp @@ -498,7 +504,7 @@ func Parse(s string) (float64, error) { return f, nil } } - return 0, fmt.Errorf("cannot parse float64 from %q", s) + return 0, errors.New("cannot parse float64 from " + strconv.Quote(s)) } var inf = math.Inf(1) diff --git a/mergevalues.go b/mergevalues.go index fbfb5fd..2b7c02e 100644 --- a/mergevalues.go +++ b/mergevalues.go @@ -1,7 +1,6 @@ package astjson import ( - "bytes" "errors" "github.com/wundergraph/go-arena" @@ -34,18 +33,24 @@ func MergeValues(ar arena.Arena, a, b *Value) (v *Value, changed bool, err error if b == nil { return a, false, nil } - if b.Type() == TypeNull && a.Type() == TypeObject { + at, bt := a.t, b.t + if bt == TypeNull && at == TypeObject { // we assume that null was returned in an error case for resolving a nested object field // as we've got an object on the left side, we don't override the whole object with null // instead, we keep the left object and discard the null on the right side return a, false, nil } - aBool, bBool := a.Type() == TypeTrue || a.Type() == TypeFalse, b.Type() == TypeTrue || b.Type() == TypeFalse - booleans := aBool && bBool - if a.Type() != b.Type() && !booleans { - return nil, false, ErrMergeDifferentTypes + if at != bt { + // Only compute boolean compatibility when types actually differ + aBool := at == TypeTrue || at == TypeFalse + bBool := bt == TypeTrue || bt == TypeFalse + if !aBool || !bBool { + return nil, false, ErrMergeDifferentTypes + } + // Types differ but both are booleans — b replaces a + return b, 
true, nil } - switch a.Type() { + switch at { case TypeObject: ao, _ := a.Object() bo, _ := b.Object() @@ -94,29 +99,23 @@ func MergeValues(ar arena.Arena, a, b *Value) (v *Value, changed bool, err error } } return a, false, nil - case TypeFalse: - if b.Type() == TypeTrue { - return b, true, nil - } - return a, false, nil - case TypeTrue: - if b.Type() == TypeFalse { - return b, true, nil - } - return a, false, nil - case TypeNull: + case TypeTrue, TypeFalse, TypeNull: + // at == bt guaranteed by the check above, no change needed return a, false, nil case TypeNumber: - af, _ := a.Float64() - bf, _ := b.Float64() - if af != bf { + // Fast path: if raw number strings are identical, values are equal. + // This avoids expensive float64 parsing in the common case. + if a.s == b.s { + return a, false, nil + } + af, aErr := a.Float64() + bf, bErr := b.Float64() + if aErr != nil || bErr != nil || af != bf { return b, true, nil } return a, false, nil case TypeString: - as, _ := a.StringBytes() - bs, _ := b.StringBytes() - if !bytes.Equal(as, bs) { + if a.s != b.s { return b, true, nil } return a, false, nil diff --git a/parser.go b/parser.go index bd7f1cf..67f1d89 100644 --- a/parser.go +++ b/parser.go @@ -1,7 +1,7 @@ package astjson import ( - "fmt" + "errors" "strconv" "strings" "unicode/utf16" @@ -11,6 +11,87 @@ import ( "github.com/wundergraph/go-arena" ) +// Sentinel errors for static error messages. +// Using pre-allocated errors avoids fmt.Errorf allocations and removes the fmt +// import, which can improve inlining budgets for functions in this file. 
+var ( + errEmptyString = errors.New("cannot parse empty string") + errMaxDepth = errors.New("too big depth for the nested JSON; it exceeds 300") + errMissingClosingBracket = errors.New("missing ']'") + errMissingClosingBrace = errors.New("missing '}'") + errMissingCommaArray = errors.New("missing ',' after array value") + errMissingCommaObject = errors.New("missing ',' after object value") + errUnexpectedEndArray = errors.New("unexpected end of array") + errUnexpectedEndObject = errors.New("unexpected end of object") + errMissingOpenQuote = errors.New(`cannot find opening '"' for object key`) + errMissingColon = errors.New("missing ':' after object key") + errMissingClosingQuote = errors.New(`missing closing '"'`) +) + +// parseContext holds per-parse state including slab allocators that amortize +// arena allocation overhead by allocating Values and kvs in batches. +type parseContext struct { + a arena.Arena + vs valueSlab + ks kvSlab +} + +// valueSlab allocates Values in batches to amortize arena overhead. +// Starts with a small batch and doubles up to maxSlabSize. +type valueSlab struct { + values []Value + pos int +} + +const ( + minSlabSize = 8 + maxSlabSize = 64 +) + +func (s *valueSlab) get(a arena.Arena) *Value { + if a == nil { + return new(Value) + } + if s.pos >= len(s.values) { + size := len(s.values) * 2 + if size < minSlabSize { + size = minSlabSize + } else if size > maxSlabSize { + size = maxSlabSize + } + s.values = arena.AllocateSlice[Value](a, size, size) + s.pos = 0 + } + v := &s.values[s.pos] + s.pos++ + return v +} + +// kvSlab allocates kv structs in batches to amortize arena overhead. 
+type kvSlab struct { + kvs []kv + pos int +} + +func (s *kvSlab) get(a arena.Arena) *kv { + if a == nil { + return new(kv) + } + if s.pos >= len(s.kvs) { + size := len(s.kvs) * 2 + if size < minSlabSize { + size = minSlabSize + } else if size > maxSlabSize { + size = maxSlabSize + } + s.kvs = arena.AllocateSlice[kv](a, size, size) + s.pos = 0 + } + k := &s.kvs[s.pos] + s.pos++ + return k +} + // ParseError wraps a JSON parsing error. type ParseError struct { Err error @@ -106,15 +187,16 @@ func (p *Parser) ParseBytesWithArena(a arena.Arena, b []byte) (*Value, error) { } func (p *Parser) parse(a arena.Arena, s string) (*Value, error) { + ctx := parseContext{a: a} s = skipWS(s) - v, tail, err := parseValue(a, s, 0) + v, tail, err := parseValue(&ctx, s, 0) if err != nil { - return nil, NewParseError(fmt.Errorf("cannot parse JSON: %s; unparsed tail: %q", err, startEndString(tail))) + return nil, NewParseError(errors.New("cannot parse JSON: " + err.Error() + "; unparsed tail: " + strconv.Quote(startEndString(tail)))) } tail = skipWS(tail) if len(tail) > 0 { - return nil, NewParseError(fmt.Errorf("unexpected tail: %q", startEndString(tail))) + return nil, NewParseError(errors.New("unexpected tail: " + strconv.Quote(startEndString(tail)))) } return v, nil } @@ -131,15 +213,9 @@ func skipWSSlow(s string) string { if len(s) == 0 { return s } - - // Branch prediction optimization: check most common whitespace first - // Space (0x20) is most common, then newline, tab, carriage return for i := 0; i < len(s); i++ { - c := s[i] - if c != 0x20 { // Most common whitespace - if c != 0x0A && c != 0x09 && c != 0x0D { - return s[i:] - } + if charFlags[s[i]]&charWS == 0 { + return s[i:] } } return "" @@ -157,13 +233,13 @@ type kv struct { // MaxDepth is the maximum depth for nested JSON. 
const MaxDepth = 300 -func parseValue(a arena.Arena, s string, depth int) (*Value, string, error) { +func parseValue(ctx *parseContext, s string, depth int) (*Value, string, error) { if len(s) == 0 { - return nil, s, fmt.Errorf("cannot parse empty string") + return nil, s, errEmptyString } depth++ if depth > MaxDepth { - return nil, s, fmt.Errorf("too big depth for the nested JSON; it exceeds %d", MaxDepth) + return nil, s, errMaxDepth } // Branch prediction optimization: order by frequency @@ -173,99 +249,98 @@ func parseValue(a arena.Arena, s string, depth int) (*Value, string, error) { // String - most common in JSON ss, tail, err := parseRawString(s[1:]) if err != nil { - return nil, tail, fmt.Errorf("cannot parse string: %s", err) + return nil, tail, errors.New("cannot parse string: " + err.Error()) } - v := arena.Allocate[Value](a) + v := ctx.vs.get(ctx.a) v.t = TypeString - v.s = unescapeStringBestEffort(a, ss) + v.s = unescapeStringBestEffort(ctx.a, ss) return v, tail, nil case '{': // Object - very common - v, tail, err := parseObject(a, s[1:], depth) + v, tail, err := parseObject(ctx, s[1:], depth) if err != nil { - return nil, tail, fmt.Errorf("cannot parse object: %s", err) + return nil, tail, errors.New("cannot parse object: " + err.Error()) } return v, tail, nil case '[': // Array - common - v, tail, err := parseArray(a, s[1:], depth) + v, tail, err := parseArray(ctx, s[1:], depth) if err != nil { - return nil, tail, fmt.Errorf("cannot parse array: %s", err) + return nil, tail, errors.New("cannot parse array: " + err.Error()) } return v, tail, nil case 't': // true literal - less common if len(s) < len("true") || s[:len("true")] != "true" { - return nil, s, fmt.Errorf("unexpected value found: %q", s) + return nil, s, errors.New("unexpected value found: " + strconv.Quote(s)) } return valueTrue, s[len("true"):], nil case 'f': // false literal - less common if len(s) < len("false") || s[:len("false")] != "false" { - return nil, s, fmt.Errorf("unexpected 
value found: %q", s) + return nil, s, errors.New("unexpected value found: " + strconv.Quote(s)) } return valueFalse, s[len("false"):], nil case 'n': // null literal - less common if len(s) < len("null") || s[:len("null")] != "null" { // Try parsing NaN - if len(s) >= 3 && strings.EqualFold(s[:3], "nan") { - v := arena.Allocate[Value](a) + if len(s) >= 3 && (s[0]|0x20) == 'n' && (s[1]|0x20) == 'a' && (s[2]|0x20) == 'n' { + v := ctx.vs.get(ctx.a) v.t = TypeNumber v.s = s[:3] return v, s[3:], nil } - return nil, s, fmt.Errorf("unexpected value found: %q", s) + return nil, s, errors.New("unexpected value found: " + strconv.Quote(s)) } return valueNull, s[len("null"):], nil default: // Number - very common, but handled last due to complex parsing ns, tail, err := parseRawNumber(s) if err != nil { - return nil, tail, fmt.Errorf("cannot parse number: %s", err) + return nil, tail, errors.New("cannot parse number: " + err.Error()) } - v := arena.Allocate[Value](a) + v := ctx.vs.get(ctx.a) v.t = TypeNumber v.s = ns return v, tail, nil } } -func parseArray(a arena.Arena, s string, depth int) (*Value, string, error) { +func parseArray(ctx *parseContext, s string, depth int) (*Value, string, error) { s = skipWS(s) if len(s) == 0 { - return nil, s, fmt.Errorf("missing ']'") + return nil, s, errMissingClosingBracket } if s[0] == ']' { - v := arena.Allocate[Value](a) + v := ctx.vs.get(ctx.a) v.t = TypeArray v.a = v.a[:0] return v, s[1:], nil } - arr := arena.Allocate[Value](a) + arr := ctx.vs.get(ctx.a) arr.t = TypeArray - arr.a = arr.a[:0] + arr.a = arena.AllocateSlice[*Value](ctx.a, 0, 8) for { var v *Value var err error s = skipWS(s) - v, s, err = parseValue(a, s, depth) + v, s, err = parseValue(ctx, s, depth) if err != nil { - return nil, s, fmt.Errorf("cannot parse array value: %s", err) + return nil, s, errors.New("cannot parse array value: " + err.Error()) } - if arr.a == nil { - arr.a = arena.AllocateSlice[*Value](a, 1, 1) - arr.a[0] = v + if len(arr.a) < cap(arr.a) { + 
arr.a = append(arr.a, v) } else { - arr.a = arena.SliceAppend(a, arr.a, v) + arr.a = arena.SliceAppend(ctx.a, arr.a, v) } s = skipWS(s) if len(s) == 0 { - return nil, s, fmt.Errorf("unexpected end of array") + return nil, s, errUnexpectedEndArray } if s[0] == ',' { s = s[1:] @@ -275,56 +350,63 @@ func parseArray(a arena.Arena, s string, depth int) (*Value, string, error) { s = s[1:] return arr, s, nil } - return nil, s, fmt.Errorf("missing ',' after array value") + return nil, s, errMissingCommaArray } } -func parseObject(a arena.Arena, s string, depth int) (*Value, string, error) { +func parseObject(ctx *parseContext, s string, depth int) (*Value, string, error) { s = skipWS(s) if len(s) == 0 { - return nil, s, fmt.Errorf("missing '}'") + return nil, s, errMissingClosingBrace } if s[0] == '}' { - v := arena.Allocate[Value](a) + v := ctx.vs.get(ctx.a) v.t = TypeObject v.o.reset() return v, s[1:], nil } - o := arena.Allocate[Value](a) + o := ctx.vs.get(ctx.a) o.t = TypeObject - o.o.reset() + o.o.kvs = arena.AllocateSlice[*kv](ctx.a, 0, 8) for { var err error - kv := o.o.getKV(a) + // Inline kv allocation from slab instead of calling getKV + // (getKV is kept unchanged for Object.Set in update.go) + newKV := ctx.ks.get(ctx.a) + if len(o.o.kvs) < cap(o.o.kvs) { + o.o.kvs = append(o.o.kvs, newKV) + } else { + o.o.kvs = arena.SliceAppend(ctx.a, o.o.kvs, newKV) + } // Parse key. 
s = skipWS(s) if len(s) == 0 || s[0] != '"' { - return nil, s, fmt.Errorf(`cannot find opening '"" for object key`) + return nil, s, errMissingOpenQuote } - kv.k, s, err = parseRawKey(s[1:]) + newKV.k, s, err = parseRawKey(s[1:]) if err != nil { - return nil, s, fmt.Errorf("cannot parse object key: %s", err) + return nil, s, errors.New("cannot parse object key: " + err.Error()) } - kv.k = unescapeStringBestEffort(a, kv.k) - kv.keyUnescaped = true + newKV.k = unescapeStringBestEffort(ctx.a, newKV.k) + newKV.keyUnescaped = true s = skipWS(s) if len(s) == 0 || s[0] != ':' { - return nil, s, fmt.Errorf("missing ':' after object key") + return nil, s, errMissingColon } s = s[1:] // Parse value s = skipWS(s) - kv.v, s, err = parseValue(a, s, depth) + newKV.v, s, err = parseValue(ctx, s, depth) if err != nil { - return nil, s, fmt.Errorf("cannot parse object value: %s", err) + return nil, s, errors.New("cannot parse object value: " + err.Error()) } s = skipWS(s) if len(s) == 0 { - return nil, s, fmt.Errorf("unexpected end of object") + return nil, s, errUnexpectedEndObject } if s[0] == ',' { s = s[1:] @@ -333,7 +415,7 @@ func parseObject(a arena.Arena, s string, depth int) (*Value, string, error) { if s[0] == '}' { return o, s[1:], nil } - return nil, s, fmt.Errorf("missing ',' after object value") + return nil, s, errMissingCommaObject } } @@ -351,15 +433,8 @@ func escapeString(dst []byte, s string) []byte { } func hasSpecialChars(s string) bool { - // Branch prediction optimization: check most common cases first for i := 0; i < len(s); i++ { - c := s[i] - // Most common special chars first - if c == '"' || c == '\\' { - return true - } - // Control characters - less common - if c < 0x20 { + if charFlags[s[i]]&charEscape != 0 { return true } } @@ -411,12 +486,10 @@ func unescapeStringBestEffort(a arena.Arena, s string) string { return s } - // Estimate capacity to avoid frequent reallocations - estimatedCap := len(s) + 4 - b := arena.AllocateSlice[byte](a, 0, 
estimatedCap) - - // Add the initial part before the first escape - b = arena.SliceAppend(a, b, []byte(s[:n])...) + // Pre-allocate buffer to len(s) — unescaped is always <= escaped length. + // Use direct indexing instead of per-character SliceAppend. + b := arena.AllocateSlice[byte](a, len(s), len(s)) + w := copy(b, s[:n]) s = s[n+1:] for len(s) > 0 { @@ -424,95 +497,104 @@ func unescapeStringBestEffort(a arena.Arena, s string) string { s = s[1:] switch ch { case '"': - b = arena.SliceAppend(a, b, '"') + b[w] = '"' + w++ case '\\': - b = arena.SliceAppend(a, b, '\\') + b[w] = '\\' + w++ case '/': - b = arena.SliceAppend(a, b, '/') + b[w] = '/' + w++ case 'b': - b = arena.SliceAppend(a, b, '\b') + b[w] = '\b' + w++ case 'f': - b = arena.SliceAppend(a, b, '\f') + b[w] = '\f' + w++ case 'n': - b = arena.SliceAppend(a, b, '\n') + b[w] = '\n' + w++ case 'r': - b = arena.SliceAppend(a, b, '\r') + b[w] = '\r' + w++ case 't': - b = arena.SliceAppend(a, b, '\t') + b[w] = '\t' + w++ case 'u': if len(s) < 4 { - // Too short escape sequence. Just store it unchanged. - b = arena.SliceAppend(a, b, []byte("\\u")...) + b[w] = '\\' + b[w+1] = 'u' + w += 2 break } xs := s[:4] - x, err := strconv.ParseUint(xs, 16, 16) - if err != nil { - // Invalid escape sequence. Just store it unchanged. - b = arena.SliceAppend(a, b, []byte("\\u")...) + x, ok := parseHex4(xs) + if !ok { + b[w] = '\\' + b[w+1] = 'u' + w += 2 break } s = s[4:] if !utf16.IsSurrogate(rune(x)) { - var buf [utf8.UTFMax]byte - n := utf8.EncodeRune(buf[:], rune(x)) - b = arena.SliceAppend(a, b, buf[:n]...) + w += utf8.EncodeRune(b[w:], rune(x)) break } // Surrogate. // See https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates if len(s) < 6 || s[0] != '\\' || s[1] != 'u' { - b = arena.SliceAppend(a, b, []byte("\\u")...) - b = arena.SliceAppend(a, b, []byte(xs)...) 
+ b[w] = '\\' + b[w+1] = 'u' + w += 2 + w += copy(b[w:], xs) break } - x1, err := strconv.ParseUint(s[2:6], 16, 16) - if err != nil { - b = arena.SliceAppend(a, b, []byte("\\u")...) - b = arena.SliceAppend(a, b, []byte(xs)...) + x1, ok := parseHex4(s[2:6]) + if !ok { + b[w] = '\\' + b[w+1] = 'u' + w += 2 + w += copy(b[w:], xs) break } r := utf16.DecodeRune(rune(x), rune(x1)) - var buf [utf8.UTFMax]byte - rn := utf8.EncodeRune(buf[:], r) - b = arena.SliceAppend(a, b, buf[:rn]...) + w += utf8.EncodeRune(b[w:], r) s = s[6:] default: - // Unknown escape sequence. Just store it unchanged. - b = arena.SliceAppend(a, b, '\\', ch) + b[w] = '\\' + b[w+1] = ch + w += 2 } n = strings.IndexByte(s, '\\') if n < 0 { - b = arena.SliceAppend(a, b, []byte(s)...) + w += copy(b[w:], s) break } - b = arena.SliceAppend(a, b, []byte(s[:n])...) + w += copy(b[w:], s[:n]) s = s[n+1:] } - return b2s(b) + return b2s(b[:w]) } // parseRawKey is similar to parseRawString, but is optimized // for small-sized keys without escape sequences. func parseRawKey(s string) (string, string, error) { - for i := 0; i < len(s); i++ { - if s[i] == '"' { - // Fast path. - return s[:i], s[i+1:], nil - } - if s[i] == '\\' { - // Slow path. - return parseRawString(s) - } + n := strings.IndexByte(s, '"') + if n < 0 { + return s, "", errMissingClosingQuote + } + // Check if the key portion contains an escape sequence. + if strings.IndexByte(s[:n], '\\') >= 0 { + return parseRawString(s) } - return s, "", fmt.Errorf(`missing closing '"'`) + return s[:n], s[n+1:], nil } func parseRawString(s string) (string, string, error) { n := strings.IndexByte(s, '"') if n < 0 { - return s, "", fmt.Errorf(`missing closing '"'`) + return s, "", errMissingClosingQuote } if n == 0 || s[n-1] != '\\' { // Fast path. No escaped ". 
@@ -533,7 +615,7 @@ func parseRawString(s string) (string, string, error) { n = strings.IndexByte(s, '"') if n < 0 { - return ss, "", fmt.Errorf(`missing closing '"'`) + return ss, "", errMissingClosingQuote } if n == 0 || s[n-1] != '\\' { return ss[:len(ss)-len(s)+n], s[n+1:], nil @@ -546,18 +628,18 @@ func parseRawNumber(s string) (string, string, error) { // Find the end of the number. for i := 0; i < len(s); i++ { - ch := s[i] - if (ch >= '0' && ch <= '9') || ch == '.' || ch == '-' || ch == 'e' || ch == 'E' || ch == '+' { + if charFlags[s[i]]&charNumChar != 0 { continue } if i == 0 || i == 1 && (s[0] == '-' || s[0] == '+') { if len(s[i:]) >= 3 { xs := s[i : i+3] - if strings.EqualFold(xs, "inf") || strings.EqualFold(xs, "nan") { + if ((xs[0]|0x20) == 'i' && (xs[1]|0x20) == 'n' && (xs[2]|0x20) == 'f') || + ((xs[0]|0x20) == 'n' && (xs[1]|0x20) == 'a' && (xs[2]|0x20) == 'n') { return s[:i+3], s[i+3:], nil } } - return "", s, fmt.Errorf("unexpected char: %q", s[:1]) + return "", s, errors.New("unexpected char: " + strconv.Quote(s[:1])) } ns := s[:i] s = s[i:] @@ -573,12 +655,13 @@ func parseRawNumber(s string) (string, string, error) { // // Cache-friendly layout: hot data first type Object struct { - kvs []*kv // HOT: frequently accessed - 24 bytes - // Total: 24 bytes - compact and cache-friendly + kvs []*kv // HOT: frequently accessed + kvIndex map[string]int // lazily built on first Get when len(kvs) > 16 } func (o *Object) reset() { o.kvs = o.kvs[:0] + o.kvIndex = nil } // MarshalTo appends marshaled o to dst and returns the result. 
@@ -615,9 +698,14 @@ func (o *Object) String() string { func (o *Object) getKV(a arena.Arena) *kv { if o.kvs == nil { - o.kvs = arena.AllocateSlice[*kv](a, 0, 1) + o.kvs = arena.AllocateSlice[*kv](a, 0, 4) + } + newKV := arena.Allocate[kv](a) + if len(o.kvs) < cap(o.kvs) { + o.kvs = append(o.kvs, newKV) + } else { + o.kvs = arena.SliceAppend(a, o.kvs, newKV) } - o.kvs = arena.SliceAppend(a, o.kvs, arena.Allocate[kv](a)) return o.kvs[len(o.kvs)-1] } @@ -642,6 +730,19 @@ func (o *Object) Get(key string) *Value { if o == nil { return nil } + // For large objects, use a lazily-built hash map for O(1) lookup; it is filled back-to-front so the first of any duplicate keys wins, exactly like the linear scan below. + if len(o.kvs) > 16 { + if o.kvIndex == nil { // NOTE(review): this lazy build writes o.kvIndex, so Get now mutates o - confirm no caller runs Get concurrently on a shared Object. + o.kvIndex = make(map[string]int, len(o.kvs)) + for i := len(o.kvs) - 1; i >= 0; i-- { + o.kvIndex[o.kvs[i].k] = i + } + } + if i, ok := o.kvIndex[key]; ok { + return o.kvs[i].v + } + return nil + } // Keys are always pre-unescaped during parsing and Object.Set, // so direct comparison is sufficient. for _, kv := range o.kvs { @@ -675,11 +776,10 @@ func (o *Object) Visit(f func(key []byte, v *Value)) { // // Cache-friendly layout: hot data first, compact structure type Value struct { - t Type // HOT: accessed on every operation - 8 bytes + t Type // HOT: accessed on every operation - 1 byte s string // HOT: frequently accessed for strings/numbers - 16 bytes a []*Value // HOT: frequently accessed for arrays - 24 bytes - o Object // COLD: less frequently accessed - 25 bytes - // Total: 73 bytes - compact and cache-friendly + o Object // COLD: less frequently accessed - 32 bytes (kvs slice header + kvIndex map pointer) } // MarshalTo appends marshaled v to dst and returns the result. @@ -708,7 +808,7 @@ func (v *Value) MarshalTo(dst []byte) []byte { case TypeNull: return append(dst, "null"...) default: - panic(fmt.Errorf("BUG: unexpected Value type: %d", v.t)) + panic("BUG: unexpected Value type: " + strconv.Itoa(int(v.t))) } } @@ -727,7 +827,7 @@ func (v *Value) String() string { } // Type represents JSON type. -type Type int +type Type uint8 const ( // TypeNull is JSON null.
@@ -773,7 +873,7 @@ func (t Type) String() string { // typeRawString is skipped intentionally, // since it shouldn't be visible to user. default: - panic(fmt.Errorf("BUG: unknown Value type: %d", t)) + panic("BUG: unknown Value type: " + strconv.Itoa(int(t))) } } @@ -953,7 +1053,7 @@ func (v *Value) GetBool(keys ...string) bool { // Use GetObject if you don't need error handling. func (v *Value) Object() (*Object, error) { if v.t != TypeObject { - return nil, fmt.Errorf("value doesn't contain object; it contains %s", v.Type()) + return nil, errors.New("value doesn't contain object; it contains " + v.Type().String()) } return &v.o, nil } @@ -965,7 +1065,7 @@ func (v *Value) Object() (*Object, error) { // Use GetArray if you don't need error handling. func (v *Value) Array() ([]*Value, error) { if v.t != TypeArray { - return nil, fmt.Errorf("value doesn't contain array; it contains %s", v.Type()) + return nil, errors.New("value doesn't contain array; it contains " + v.Type().String()) } return v.a, nil } @@ -977,7 +1077,7 @@ func (v *Value) Array() ([]*Value, error) { // Use GetStringBytes if you don't need error handling. func (v *Value) StringBytes() ([]byte, error) { if v.Type() != TypeString { - return nil, fmt.Errorf("value doesn't contain string; it contains %s", v.Type()) + return nil, errors.New("value doesn't contain string; it contains " + v.Type().String()) } return s2b(v.s), nil } @@ -987,7 +1087,7 @@ func (v *Value) StringBytes() ([]byte, error) { // Use GetFloat64 if you don't need error handling. func (v *Value) Float64() (float64, error) { if v.Type() != TypeNumber { - return 0, fmt.Errorf("value doesn't contain number; it contains %s", v.Type()) + return 0, errors.New("value doesn't contain number; it contains " + v.Type().String()) } return fastfloat.Parse(v.s) } @@ -997,7 +1097,7 @@ func (v *Value) Float64() (float64, error) { // Use GetInt if you don't need error handling. 
func (v *Value) Int() (int, error) { if v.Type() != TypeNumber { - return 0, fmt.Errorf("value doesn't contain number; it contains %s", v.Type()) + return 0, errors.New("value doesn't contain number; it contains " + v.Type().String()) } n, err := fastfloat.ParseInt64(v.s) if err != nil { @@ -1011,7 +1111,7 @@ func (v *Value) Int() (int, error) { // Use GetInt if you don't need error handling. func (v *Value) Uint() (uint, error) { if v.Type() != TypeNumber { - return 0, fmt.Errorf("value doesn't contain number; it contains %s", v.Type()) + return 0, errors.New("value doesn't contain number; it contains " + v.Type().String()) } n, err := fastfloat.ParseUint64(v.s) if err != nil { @@ -1025,7 +1125,7 @@ func (v *Value) Uint() (uint, error) { // Use GetInt64 if you don't need error handling. func (v *Value) Int64() (int64, error) { if v.Type() != TypeNumber { - return 0, fmt.Errorf("value doesn't contain number; it contains %s", v.Type()) + return 0, errors.New("value doesn't contain number; it contains " + v.Type().String()) } return fastfloat.ParseInt64(v.s) } @@ -1035,7 +1135,7 @@ func (v *Value) Int64() (int64, error) { // Use GetInt64 if you don't need error handling. 
func (v *Value) Uint64() (uint64, error) { if v.Type() != TypeNumber { - return 0, fmt.Errorf("value doesn't contain number; it contains %s", v.Type()) + return 0, errors.New("value doesn't contain number; it contains " + v.Type().String()) } return fastfloat.ParseUint64(v.s) } @@ -1050,7 +1150,7 @@ func (v *Value) Bool() (bool, error) { if v.t == TypeFalse { return false, nil } - return false, fmt.Errorf("value doesn't contain bool; it contains %s", v.Type()) + return false, errors.New("value doesn't contain bool; it contains " + v.Type().String()) } var ( diff --git a/parser_test.go b/parser_test.go index de67168..a1b2ebf 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1672,7 +1672,7 @@ func TestObjectGetEdgeCases(t *testing.T) { // TestValueMarshalToEdgeCases tests edge cases in Value.MarshalTo func TestValueMarshalToEdgeCases(t *testing.T) { t.Run("unknown type", func(t *testing.T) { - v := &Value{t: Type(999)} // Invalid type + v := &Value{t: Type(255)} // Invalid type defer func() { if r := recover(); r == nil { t.Errorf("expected panic for unknown type") @@ -1685,7 +1685,7 @@ func TestValueMarshalToEdgeCases(t *testing.T) { // TestTypeStringEdgeCases tests edge cases in Type.String func TestTypeStringEdgeCases(t *testing.T) { t.Run("unknown type", func(t *testing.T) { - tp := Type(999) // Invalid type + tp := Type(255) // Invalid type defer func() { if r := recover(); r == nil { t.Errorf("expected panic for unknown type") diff --git a/scanner.go b/scanner.go index bd5474f..20d209e 100644 --- a/scanner.go +++ b/scanner.go @@ -61,7 +61,8 @@ func (sc *Scanner) Next() bool { return false } - v, tail, err := parseValue(nil, sc.s, 0) + ctx := parseContext{} // heap mode: a == nil + v, tail, err := parseValue(&ctx, sc.s, 0) if err != nil { sc.err = err return false diff --git a/update.go b/update.go index 30e04e3..19d891c 100644 --- a/update.go +++ b/update.go @@ -17,6 +17,7 @@ func (o *Object) Del(key string) { if kv.k == key { o.kvs = append(o.kvs[:i], 
o.kvs[i+1:]...) o.kvs[:len(o.kvs)+1][len(o.kvs)] = nil // clear hidden slot for GC + o.kvIndex = nil // invalidate index return } } @@ -75,6 +76,7 @@ func (o *Object) Set(a arena.Arena, key string, value *Value) { kv.k = arenaString(a, key) kv.v = value kv.keyUnescaped = true // New keys are already unescaped since they come from user input + o.kvIndex = nil // invalidate index } // Set sets (key, value) entry in the array or object v. diff --git a/validate.go b/validate.go index 5061062..0a5a0bf 100644 --- a/validate.go +++ b/validate.go @@ -1,22 +1,40 @@ package astjson import ( - "fmt" + "errors" "strconv" "strings" ) +var ( + errValEmptyString = errors.New("cannot parse empty string") + errValMissingBracket = errors.New("missing ']'") + errValMissingBrace = errors.New("missing '}'") + errValMissingCommaArr = errors.New("missing ',' after array value") + errValMissingCommaObj = errors.New("missing ',' after object value") + errValEndArray = errors.New("unexpected end of array") + errValEndObject = errors.New("unexpected end of object") + errValMissingOpenQuote = errors.New(`cannot find opening '"' for object key`) + errValMissingColon = errors.New("missing ':' after object key") + errValMissingCloseQuote = errors.New(`missing closing '"'`) + errValZeroLenNumber = errors.New("zero-length number") + errValMissingAfterMinus = errors.New("missing number after minus") + errValUnexpectedZero = errors.New("unexpected number starting from 0") + errValMissingFractional = errors.New("missing fractional part") + errValMissingExponent = errors.New("missing exponent part") +) + // Validate validates JSON s. 
func Validate(s string) error { s = skipWS(s) tail, err := validateValue(s) if err != nil { - return fmt.Errorf("cannot parse JSON: %s; unparsed tail: %q", err, startEndString(tail)) + return errors.New("cannot parse JSON: " + err.Error() + "; unparsed tail: " + strconv.Quote(startEndString(tail))) } tail = skipWS(tail) if len(tail) > 0 { - return fmt.Errorf("unexpected tail: %q", startEndString(tail)) + return errors.New("unexpected tail: " + strconv.Quote(startEndString(tail))) } return nil } @@ -28,58 +46,58 @@ func ValidateBytes(b []byte) error { func validateValue(s string) (string, error) { if len(s) == 0 { - return s, fmt.Errorf("cannot parse empty string") + return s, errValEmptyString } if s[0] == '{' { tail, err := validateObject(s[1:]) if err != nil { - return tail, fmt.Errorf("cannot parse object: %s", err) + return tail, errors.New("cannot parse object: " + err.Error()) } return tail, nil } if s[0] == '[' { tail, err := validateArray(s[1:]) if err != nil { - return tail, fmt.Errorf("cannot parse array: %s", err) + return tail, errors.New("cannot parse array: " + err.Error()) } return tail, nil } if s[0] == '"' { sv, tail, err := validateString(s[1:]) if err != nil { - return tail, fmt.Errorf("cannot parse string: %s", err) + return tail, errors.New("cannot parse string: " + err.Error()) } // Scan the string for control chars. 
for i := 0; i < len(sv); i++ { if sv[i] < 0x20 { - return tail, fmt.Errorf("string cannot contain control char 0x%02X", sv[i]) + return tail, errors.New("string cannot contain control char 0x" + string("0123456789ABCDEF"[sv[i]>>4]) + string("0123456789ABCDEF"[sv[i]&0x0F])) } } return tail, nil } if s[0] == 't' { if len(s) < len("true") || s[:len("true")] != "true" { - return s, fmt.Errorf("unexpected value found: %q", s) + return s, errors.New("unexpected value found: " + strconv.Quote(s)) } return s[len("true"):], nil } if s[0] == 'f' { if len(s) < len("false") || s[:len("false")] != "false" { - return s, fmt.Errorf("unexpected value found: %q", s) + return s, errors.New("unexpected value found: " + strconv.Quote(s)) } return s[len("false"):], nil } if s[0] == 'n' { if len(s) < len("null") || s[:len("null")] != "null" { - return s, fmt.Errorf("unexpected value found: %q", s) + return s, errors.New("unexpected value found: " + strconv.Quote(s)) } return s[len("null"):], nil } tail, err := validateNumber(s) if err != nil { - return tail, fmt.Errorf("cannot parse number: %s", err) + return tail, errors.New("cannot parse number: " + err.Error()) } return tail, nil } @@ -87,7 +105,7 @@ func validateValue(s string) (string, error) { func validateArray(s string) (string, error) { s = skipWS(s) if len(s) == 0 { - return s, fmt.Errorf("missing ']'") + return s, errValMissingBracket } if s[0] == ']' { return s[1:], nil @@ -99,12 +117,12 @@ func validateArray(s string) (string, error) { s = skipWS(s) s, err = validateValue(s) if err != nil { - return s, fmt.Errorf("cannot parse array value: %s", err) + return s, errors.New("cannot parse array value: " + err.Error()) } s = skipWS(s) if len(s) == 0 { - return s, fmt.Errorf("unexpected end of array") + return s, errValEndArray } if s[0] == ',' { s = s[1:] @@ -114,14 +132,14 @@ func validateArray(s string) (string, error) { s = s[1:] return s, nil } - return s, fmt.Errorf("missing ',' after array value") + return s, errValMissingCommaArr } } func validateObject(s
string) (string, error) { s = skipWS(s) if len(s) == 0 { - return s, fmt.Errorf("missing '}'") + return s, errValMissingBrace } if s[0] == '}' { return s[1:], nil @@ -133,23 +151,23 @@ func validateObject(s string) (string, error) { // Parse key. s = skipWS(s) if len(s) == 0 || s[0] != '"' { - return s, fmt.Errorf(`cannot find opening '"" for object key`) + return s, errValMissingOpenQuote } var key string key, s, err = validateKey(s[1:]) if err != nil { - return s, fmt.Errorf("cannot parse object key: %s", err) + return s, errors.New("cannot parse object key: " + err.Error()) } // Scan the key for control chars. for i := 0; i < len(key); i++ { if key[i] < 0x20 { - return s, fmt.Errorf("object key cannot contain control char 0x%02X", key[i]) + return s, errors.New("object key cannot contain control char 0x" + string("0123456789ABCDEF"[key[i]>>4]) + string("0123456789ABCDEF"[key[i]&0x0F])) } } s = skipWS(s) if len(s) == 0 || s[0] != ':' { - return s, fmt.Errorf("missing ':' after object key") + return s, errValMissingColon } s = s[1:] @@ -157,11 +175,11 @@ func validateObject(s string) (string, error) { s = skipWS(s) s, err = validateValue(s) if err != nil { - return s, fmt.Errorf("cannot parse object value: %s", err) + return s, errors.New("cannot parse object value: " + err.Error()) } s = skipWS(s) if len(s) == 0 { - return s, fmt.Errorf("unexpected end of object") + return s, errValEndObject } if s[0] == ',' { s = s[1:] @@ -170,7 +188,7 @@ func validateObject(s string) (string, error) { if s[0] == '}' { return s[1:], nil } - return s, fmt.Errorf("missing ',' after object value") + return s, errValMissingCommaObj } } @@ -187,7 +205,7 @@ func validateKey(s string) (string, string, error) { return validateString(s) } } - return "", s, fmt.Errorf(`missing closing '"'`) + return "", s, errValMissingCloseQuote } func validateString(s string) (string, string, error) { @@ -215,28 +233,28 @@ func validateString(s string) (string, string, error) { continue case 'u': if len(rs) < 4 { - return rs, tail,
fmt.Errorf(`too short escape sequence: \u%s`, rs) + return rs, tail, errors.New(`too short escape sequence: \u` + rs) } xs := rs[:4] _, err := strconv.ParseUint(xs, 16, 16) if err != nil { - return rs, tail, fmt.Errorf(`invalid escape sequence \u%s: %s`, xs, err) + return rs, tail, errors.New(`invalid escape sequence \u` + xs + ": " + err.Error()) } rs = rs[4:] default: - return rs, tail, fmt.Errorf(`unknown escape sequence \%c`, ch) + return rs, tail, errors.New(`unknown escape sequence \` + string(ch)) } } } func validateNumber(s string) (string, error) { if len(s) == 0 { - return s, fmt.Errorf("zero-length number") + return s, errValZeroLenNumber } if s[0] == '-' { s = s[1:] if len(s) == 0 { - return s, fmt.Errorf("missing number after minus") + return s, errValMissingAfterMinus } } i := 0 @@ -247,10 +265,10 @@ func validateNumber(s string) (string, error) { i++ } if i <= 0 { - return s, fmt.Errorf("expecting 0..9 digit, got %c", s[0]) + return s, errors.New("expecting 0..9 digit, got " + string(s[0])) } if s[0] == '0' && i != 1 { - return s, fmt.Errorf("unexpected number starting from 0") + return s, errValUnexpectedZero } if i >= len(s) { return "", nil @@ -259,7 +277,7 @@ func validateNumber(s string) (string, error) { // Validate fractional part s = s[i+1:] if len(s) == 0 { - return s, fmt.Errorf("missing fractional part") + return s, errValMissingFractional } i = 0 for i < len(s) { @@ -269,7 +287,7 @@ func validateNumber(s string) (string, error) { i++ } if i == 0 { - return s, fmt.Errorf("expecting 0..9 digit in fractional part, got %c", s[0]) + return s, errors.New("expecting 0..9 digit in fractional part, got " + string(s[0])) } if i >= len(s) { return "", nil @@ -279,12 +297,12 @@ func validateNumber(s string) (string, error) { // Validate exponent part s = s[i+1:] if len(s) == 0 { - return s, fmt.Errorf("missing exponent part") + return s, errValMissingExponent } if s[0] == '-' || s[0] == '+' { s = s[1:] if len(s) == 0 { - return s, fmt.Errorf("missing 
exponent part") + return s, errValMissingExponent } } i = 0 @@ -295,7 +313,7 @@ func validateNumber(s string) (string, error) { i++ } if i == 0 { - return s, fmt.Errorf("expecting 0..9 digit in exponent part, got %c", s[0]) + return s, errors.New("expecting 0..9 digit in exponent part, got " + string(s[0])) } if i >= len(s) { return "", nil From 04b2d94d781e0c0505186cb04239859fd1bd64f1 Mon Sep 17 00:00:00 2001 From: Jens Neuse Date: Tue, 3 Mar 2026 00:01:04 +0100 Subject: [PATCH 2/4] fix: kvIndex duplicate key semantics and add review comments - Object.Get kvIndex now stores first occurrence to match linear scan first-wins behavior for duplicate keys - Add doc comments for kvIndex field, parseHex4 bit-trick, and MergeValues number comparison error handling Co-Authored-By: Claude Opus 4.6 --- chartable.go | 1 + mergevalues.go | 2 ++ parser.go | 7 +++++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/chartable.go b/chartable.go index ddcc3f1..0763cfe 100644 --- a/chartable.go +++ b/chartable.go @@ -57,6 +57,7 @@ func init() { // Returns the value and true on success, or 0 and false on invalid input. func parseHex4(s string) (uint16, bool) { a, b, c, d := hexDigit[s[0]], hexDigit[s[1]], hexDigit[s[2]], hexDigit[s[3]] + // Valid hex digits are 0..15 (low nibble); invalid sentinel 0xFF has high bits set. if (a|b|c|d)&0xF0 != 0 { return 0, false } diff --git a/mergevalues.go b/mergevalues.go index 2b7c02e..eb830bd 100644 --- a/mergevalues.go +++ b/mergevalues.go @@ -108,6 +108,8 @@ func MergeValues(ar arena.Arena, a, b *Value) (v *Value, changed bool, err error if a.s == b.s { return a, false, nil } + // Slow path: parse as float64. If either parse fails or values differ, + // treat as changed (b replaces a). 
af, aErr := a.Float64() bf, bErr := b.Float64() if aErr != nil || bErr != nil || af != bf { diff --git a/parser.go b/parser.go index 67f1d89..11a7e0d 100644 --- a/parser.go +++ b/parser.go @@ -656,7 +656,7 @@ func parseRawNumber(s string) (string, string, error) { // Cache-friendly layout: hot data first type Object struct { kvs []*kv // HOT: frequently accessed - kvIndex map[string]int // lazily built on first Get when len(kvs) > 16 + kvIndex map[string]int // lazily-built reverse index for O(1) lookups on objects with >16 keys; invalidated on Del/Set } func (o *Object) reset() { @@ -735,7 +735,10 @@ func (o *Object) Get(key string) *Value { if o.kvIndex == nil { o.kvIndex = make(map[string]int, len(o.kvs)) for i, kv := range o.kvs { - o.kvIndex[kv.k] = i + // Store first occurrence to match linear scan semantics. + if _, exists := o.kvIndex[kv.k]; !exists { + o.kvIndex[kv.k] = i + } } } if i, ok := o.kvIndex[key]; ok { From 662d2d0eec382cfbc471b7d2b3c6111d05855586 Mon Sep 17 00:00:00 2001 From: Jens Neuse Date: Tue, 3 Mar 2026 00:06:36 +0100 Subject: [PATCH 3/4] test: add coverage for number merge with different representations Covers the path where two numbers have different raw strings but parse to the same float64 value (e.g., 1.0 vs 1.00). Brings statement coverage to 100%. 
Co-Authored-By: Claude Opus 4.6 --- mergevalues_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mergevalues_test.go b/mergevalues_test.go index 088fa96..73c96dc 100644 --- a/mergevalues_test.go +++ b/mergevalues_test.go @@ -111,6 +111,15 @@ func TestMergeValues(t *testing.T) { out := merged.MarshalTo(nil) require.Equal(t, `1.1`, string(out)) }) + t.Run("floats equal different representation", func(t *testing.T) { + t.Parallel() + a, b := MustParse(`1.0`), MustParse(`1.00`) + merged, changed, err := MergeValues(nil, a, b) + require.NoError(t, err) + require.Equal(t, false, changed) + out := merged.MarshalTo(nil) + require.Equal(t, `1.0`, string(out)) + }) t.Run("arrays", func(t *testing.T) { t.Parallel() a, b := MustParse(`[1,2]`), MustParse(`[3,4]`) From c07c8e6996aa51b9dd57a8aa84437565411dcd59 Mon Sep 17 00:00:00 2001 From: Jens Neuse Date: Tue, 3 Mar 2026 00:20:22 +0100 Subject: [PATCH 4/4] fix: benchmark error handling and SetBytes accuracy - Check parse errors in merge benchmark setup and loops instead of discarding them, so parser regressions surface as failures - Fix SetBytes in large merge benchmark to account for both objects being parsed per iteration - Add test for number merge with different float representations (1.0 vs 1.00) to reach 100% coverage Co-Authored-By: Claude Opus 4.6 --- benchmark_comprehensive_test.go | 42 ++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/benchmark_comprehensive_test.go b/benchmark_comprehensive_test.go index a4d517e..ca424f9 100644 --- a/benchmark_comprehensive_test.go +++ b/benchmark_comprehensive_test.go @@ -542,16 +542,28 @@ func BenchmarkSTMergeValuesObject(b *testing.B) { b.Run("small", func(b *testing.B) { a := arena.NewMonotonicArena(arena.WithMinBufferSize(4096)) var p Parser - aVal, _ := p.ParseWithArena(a, `{"x":1,"y":2,"z":3}`) - bVal, _ := p.ParseWithArena(a, `{"y":20,"w":4}`) + aVal, err := p.ParseWithArena(a, `{"x":1,"y":2,"z":3}`) + if err != 
nil { + b.Fatal(err) + } + bVal, err := p.ParseWithArena(a, `{"y":20,"w":4}`) + if err != nil { + b.Fatal(err) + } aBytes := []byte(aVal.String()) bBytes := []byte(bVal.String()) a.Reset() b.ReportAllocs() b.ResetTimer() for b.Loop() { - av, _ := p.ParseBytesWithArena(a, aBytes) - bv, _ := p.ParseBytesWithArena(a, bBytes) + av, err := p.ParseBytesWithArena(a, aBytes) + if err != nil { + b.Fatal(err) + } + bv, err := p.ParseBytesWithArena(a, bBytes) + if err != nil { + b.Fatal(err) + } v, _, err := MergeValues(a, av, bv) if err != nil { b.Fatal(err) @@ -571,8 +583,14 @@ func BenchmarkSTMergeValuesObject(b *testing.B) { b.ReportAllocs() b.ResetTimer() for b.Loop() { - av, _ := p.ParseBytesWithArena(a, aBytes) - bv, _ := p.ParseBytesWithArena(a, bBytes) + av, err := p.ParseBytesWithArena(a, aBytes) + if err != nil { + b.Fatal(err) + } + bv, err := p.ParseBytesWithArena(a, bBytes) + if err != nil { + b.Fatal(err) + } v, _, err := MergeValues(a, av, bv) if err != nil { b.Fatal(err) @@ -590,11 +608,17 @@ func BenchmarkSTMergeValuesObject(b *testing.B) { objJSON := obj.String() objBytes := []byte(objJSON) b.ReportAllocs() - b.SetBytes(int64(len(objBytes))) + b.SetBytes(int64(len(objBytes) * 2)) b.ResetTimer() for b.Loop() { - av, _ := p.ParseBytesWithArena(a, objBytes) - bv, _ := p.ParseBytesWithArena(a, objBytes) + av, err := p.ParseBytesWithArena(a, objBytes) + if err != nil { + b.Fatal(err) + } + bv, err := p.ParseBytesWithArena(a, objBytes) + if err != nil { + b.Fatal(err) + } v, _, err := MergeValues(a, av, bv) if err != nil { b.Fatal(err)