From 4ea03de40d7ee400dfb3e97488df8f00b2a363b0 Mon Sep 17 00:00:00 2001 From: Olivier Poitrey Date: Fri, 23 Mar 2018 02:45:05 -0700 Subject: [PATCH] Optimize JSON string encoding using a lookup table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchstat old new name old time/op new time/op delta AppendString/MultiBytesFirst-8 77.9ns ± 5% 70.2ns ± 1% -9.88% (p=0.008 n=5+5) AppendString/MultiBytesMiddle-8 64.2ns ± 1% 56.3ns ± 5% -12.19% (p=0.008 n=5+5) AppendString/MultiBytesLast-8 51.2ns ± 2% 45.2ns ± 4% -11.65% (p=0.008 n=5+5) AppendString/NoEncoding-8 36.2ns ± 4% 34.0ns ± 6% ~ (p=0.087 n=5+5) AppendString/EncodingFirst-8 67.7ns ± 2% 59.4ns ± 2% -12.26% (p=0.008 n=5+5) AppendString/EncodingMiddle-8 56.5ns ± 2% 50.6ns ± 5% -10.54% (p=0.008 n=5+5) AppendString/EncodingLast-8 41.3ns ± 1% 39.6ns ± 5% -4.11% (p=0.024 n=5+5) AppendBytes/MultiBytesLast-8 53.5ns ± 6% 45.6ns ± 4% -14.79% (p=0.008 n=5+5) AppendBytes/NoEncoding-8 36.3ns ± 3% 28.6ns ± 3% -21.10% (p=0.008 n=5+5) AppendBytes/EncodingFirst-8 67.3ns ± 4% 62.1ns ± 4% -7.75% (p=0.008 n=5+5) AppendBytes/EncodingMiddle-8 59.2ns ± 7% 51.0ns ± 6% -13.85% (p=0.008 n=5+5) AppendBytes/EncodingLast-8 43.7ns ± 6% 34.4ns ± 2% -21.32% (p=0.008 n=5+5) AppendBytes/MultiBytesFirst-8 77.7ns ± 2% 71.2ns ± 3% -8.37% (p=0.008 n=5+5) AppendBytes/MultiBytesMiddle-8 63.6ns ± 3% 57.8ns ± 5% -9.12% (p=0.008 n=5+5) --- internal/json/bytes.go | 85 ++++++++++++++++++++++++++++++++ internal/json/bytes_test.go | 82 +++++++++++++++++++++++++++++++ internal/json/string.go | 94 ++++-------------------------------- internal/json/string_test.go | 77 ----------------------------- 4 files changed, 177 insertions(+), 161 deletions(-) create mode 100644 internal/json/bytes.go create mode 100644 internal/json/bytes_test.go diff --git a/internal/json/bytes.go b/internal/json/bytes.go new file mode 100644 index 0000000..8f7d5fe --- /dev/null +++ b/internal/json/bytes.go @@ -0,0 +1,85 @@ +package json + +import "unicode/utf8" + +// AppendBytes is a mirror of appendString with []byte arg +func AppendBytes(dst, s []byte) []byte { + dst = append(dst, '"') + for i := 0; i < len(s); i++ { + if !noEscapeTable[s[i]] { + dst = appendBytesComplex(dst, s, i) + return append(dst, '"') + } + } + dst = append(dst, s...) + return append(dst, '"') +} + +// AppendHex encodes the input bytes to a hex string and appends +// the encoded string to the input byte slice. +// +// The operation loops though each byte and encodes it as hex using +// the hex lookup table. +func AppendHex(dst, s []byte) []byte { + dst = append(dst, '"') + for _, v := range s { + dst = append(dst, hex[v>>4], hex[v&0x0f]) + } + return append(dst, '"') +} + +// appendBytesComplex is a mirror of the appendStringComplex +// with []byte arg +func appendBytesComplex(dst, s []byte, i int) []byte { + start := 0 + for i < len(s) { + b := s[i] + if b >= utf8.RuneSelf { + r, size := utf8.DecodeRune(s[i:]) + if r == utf8.RuneError && size == 1 { + if start < i { + dst = append(dst, s[start:i]...) + } + dst = append(dst, `\ufffd`...) + i += size + start = i + continue + } + i += size + continue + } + if noEscapeTable[b] { + i++ + continue + } + // We encountered a character that needs to be encoded. + // Let's append the previous simple characters to the byte slice + // and switch our operation to read and encode the remainder + // characters byte-by-byte. + if start < i { + dst = append(dst, s[start:i]...) + } + switch b { + case '"', '\\': + dst = append(dst, '\\', b) + case '\b': + dst = append(dst, '\\', 'b') + case '\f': + dst = append(dst, '\\', 'f') + case '\n': + dst = append(dst, '\\', 'n') + case '\r': + dst = append(dst, '\\', 'r') + case '\t': + dst = append(dst, '\\', 't') + default: + dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF]) + } + i++ + start = i + } + if start < len(s) { + dst = append(dst, s[start:]...) + } + return dst +} diff --git a/internal/json/bytes_test.go b/internal/json/bytes_test.go new file mode 100644 index 0000000..e33c1e0 --- /dev/null +++ b/internal/json/bytes_test.go @@ -0,0 +1,82 @@ +package json + +import ( + "testing" + "unicode" +) + +func TestAppendBytes(t *testing.T) { + for _, tt := range encodeStringTests { + b := AppendBytes([]byte{}, []byte(tt.in)) + if got, want := string(b), tt.out; got != want { + t.Errorf("appendBytes(%q) = %#q, want %#q", tt.in, got, want) + } + } +} + +func TestAppendHex(t *testing.T) { + for _, tt := range encodeHexTests { + b := AppendHex([]byte{}, []byte{tt.in}) + if got, want := string(b), tt.out; got != want { + t.Errorf("appendHex(%x) = %s, want %s", tt.in, got, want) + } + } +} + +func TestStringBytes(t *testing.T) { + t.Parallel() + // Test that encodeState.stringBytes and encodeState.string use the same encoding. + var r []rune + for i := '\u0000'; i <= unicode.MaxRune; i++ { + r = append(r, i) + } + s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too + + enc := string(AppendString([]byte{}, s)) + encBytes := string(AppendBytes([]byte{}, []byte(s))) + + if enc != encBytes { + i := 0 + for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] { + i++ + } + enc = enc[i:] + encBytes = encBytes[i:] + i = 0 + for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] { + i++ + } + enc = enc[:len(enc)-i] + encBytes = encBytes[:len(encBytes)-i] + + if len(enc) > 20 { + enc = enc[:20] + "..." + } + if len(encBytes) > 20 { + encBytes = encBytes[:20] + "..." + } + + t.Errorf("encodings differ at %#q vs %#q", enc, encBytes) + } +} + +func BenchmarkAppendBytes(b *testing.B) { + tests := map[string]string{ + "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingFirst": `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`, + "MultiBytesFirst": `❤️aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤️aaaaaaaaaaaaaaaaaaaaaaaa`, + "MultiBytesLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤️`, + } + for name, str := range tests { + byt := []byte(str) + b.Run(name, func(b *testing.B) { + buf := make([]byte, 0, 100) + for i := 0; i < b.N; i++ { + _ = AppendBytes(buf, byt) + } + }) + } +} diff --git a/internal/json/string.go b/internal/json/string.go index 7f85ad6..bb606f0 100644 --- a/internal/json/string.go +++ b/internal/json/string.go @@ -4,6 +4,14 @@ import "unicode/utf8" const hex = "0123456789abcdef" +var noEscapeTable = [256]bool{} + +func init() { + for i := 0; i <= 0x7e; i++ { + noEscapeTable[i] = i >= 0x20 && i != '\\' && i != '"' + } +} + // AppendStrings encodes the input strings to json and // appends the encoded string list to the input byte slice. func AppendStrings(dst []byte, vals []string) []byte { @@ -38,7 +46,7 @@ func AppendString(dst []byte, s string) []byte { // Check if the character needs encoding. Control characters, slashes, // and the double quote need json encoding. Bytes above the ascii // boundary needs utf8 encoding. - if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' { + if !noEscapeTable[s[i]] { // We encountered a character that needs to be encoded. Switch // to complex version of the algorithm. dst = appendStringComplex(dst, s, i) @@ -76,89 +84,7 @@ func appendStringComplex(dst []byte, s string, i int) []byte { i += size continue } - if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' { - i++ - continue - } - // We encountered a character that needs to be encoded. - // Let's append the previous simple characters to the byte slice - // and switch our operation to read and encode the remainder - // characters byte-by-byte. - if start < i { - dst = append(dst, s[start:i]...) - } - switch b { - case '"', '\\': - dst = append(dst, '\\', b) - case '\b': - dst = append(dst, '\\', 'b') - case '\f': - dst = append(dst, '\\', 'f') - case '\n': - dst = append(dst, '\\', 'n') - case '\r': - dst = append(dst, '\\', 'r') - case '\t': - dst = append(dst, '\\', 't') - default: - dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF]) - } - i++ - start = i - } - if start < len(s) { - dst = append(dst, s[start:]...) - } - return dst -} - -// AppendBytes is a mirror of appendString with []byte arg -func AppendBytes(dst, s []byte) []byte { - dst = append(dst, '"') - for i := 0; i < len(s); i++ { - if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' { - dst = appendBytesComplex(dst, s, i) - return append(dst, '"') - } - } - dst = append(dst, s...) - return append(dst, '"') -} - -// AppendHex encodes the input bytes to a hex string and appends -// the encoded string to the input byte slice. -// -// The operation loops though each byte and encodes it as hex using -// the hex lookup table. -func AppendHex(dst, s []byte) []byte { - dst = append(dst, '"') - for _, v := range s { - dst = append(dst, hex[v>>4], hex[v&0x0f]) - } - return append(dst, '"') -} - -// appendBytesComplex is a mirror of the appendStringComplex -// with []byte arg -func appendBytesComplex(dst, s []byte, i int) []byte { - start := 0 - for i < len(s) { - b := s[i] - if b >= utf8.RuneSelf { - r, size := utf8.DecodeRune(s[i:]) - if r == utf8.RuneError && size == 1 { - if start < i { - dst = append(dst, s[start:i]...) - } - dst = append(dst, `\ufffd`...) - i += size - start = i - continue - } - i += size - continue - } - if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' { + if noEscapeTable[b] { i++ continue } diff --git a/internal/json/string_test.go b/internal/json/string_test.go index 0d5fc6c..a30b124 100644 --- a/internal/json/string_test.go +++ b/internal/json/string_test.go @@ -2,7 +2,6 @@ package json import ( "testing" - "unicode" ) var encodeStringTests = []struct { @@ -73,61 +72,6 @@ func TestAppendString(t *testing.T) { } } -func TestAppendBytes(t *testing.T) { - for _, tt := range encodeStringTests { - b := AppendBytes([]byte{}, []byte(tt.in)) - if got, want := string(b), tt.out; got != want { - t.Errorf("appendBytes(%q) = %#q, want %#q", tt.in, got, want) - } - } -} - -func TestAppendHex(t *testing.T) { - for _, tt := range encodeHexTests { - b := AppendHex([]byte{}, []byte{tt.in}) - if got, want := string(b), tt.out; got != want { - t.Errorf("appendHex(%x) = %s, want %s", tt.in, got, want) - } - } -} - -func TestStringBytes(t *testing.T) { - t.Parallel() - // Test that encodeState.stringBytes and encodeState.string use the same encoding. - var r []rune - for i := '\u0000'; i <= unicode.MaxRune; i++ { - r = append(r, i) - } - s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too - - enc := string(AppendString([]byte{}, s)) - encBytes := string(AppendBytes([]byte{}, []byte(s))) - - if enc != encBytes { - i := 0 - for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] { - i++ - } - enc = enc[i:] - encBytes = encBytes[i:] - i = 0 - for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] { - i++ - } - enc = enc[:len(enc)-i] - encBytes = encBytes[:len(encBytes)-i] - - if len(enc) > 20 { - enc = enc[:20] + "..." - } - if len(encBytes) > 20 { - encBytes = encBytes[:20] + "..." - } - - t.Errorf("encodings differ at %#q vs %#q", enc, encBytes) - } -} - func BenchmarkAppendString(b *testing.B) { tests := map[string]string{ "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, @@ -147,24 +91,3 @@ func BenchmarkAppendString(b *testing.B) { }) } } - -func BenchmarkAppendBytes(b *testing.B) { - tests := map[string]string{ - "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingFirst": `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`, - "MultiBytesFirst": `❤️aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤️aaaaaaaaaaaaaaaaaaaaaaaa`, - "MultiBytesLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤️`, - } - for name, str := range tests { - byt := []byte(str) - b.Run(name, func(b *testing.B) { - buf := make([]byte, 0, 100) - for i := 0; i < b.N; i++ { - _ = AppendBytes(buf, byt) - } - }) - } -}