Optimize JSON string encoding using a lookup table

benchstat old new
name                             old time/op    new time/op    delta
AppendString/MultiBytesFirst-8     77.9ns ± 5%    70.2ns ± 1%   -9.88%  (p=0.008 n=5+5)
AppendString/MultiBytesMiddle-8    64.2ns ± 1%    56.3ns ± 5%  -12.19%  (p=0.008 n=5+5)
AppendString/MultiBytesLast-8      51.2ns ± 2%    45.2ns ± 4%  -11.65%  (p=0.008 n=5+5)
AppendString/NoEncoding-8          36.2ns ± 4%    34.0ns ± 6%     ~     (p=0.087 n=5+5)
AppendString/EncodingFirst-8       67.7ns ± 2%    59.4ns ± 2%  -12.26%  (p=0.008 n=5+5)
AppendString/EncodingMiddle-8      56.5ns ± 2%    50.6ns ± 5%  -10.54%  (p=0.008 n=5+5)
AppendString/EncodingLast-8        41.3ns ± 1%    39.6ns ± 5%   -4.11%  (p=0.024 n=5+5)
AppendBytes/MultiBytesLast-8       53.5ns ± 6%    45.6ns ± 4%  -14.79%  (p=0.008 n=5+5)
AppendBytes/NoEncoding-8           36.3ns ± 3%    28.6ns ± 3%  -21.10%  (p=0.008 n=5+5)
AppendBytes/EncodingFirst-8        67.3ns ± 4%    62.1ns ± 4%   -7.75%  (p=0.008 n=5+5)
AppendBytes/EncodingMiddle-8       59.2ns ± 7%    51.0ns ± 6%  -13.85%  (p=0.008 n=5+5)
AppendBytes/EncodingLast-8         43.7ns ± 6%    34.4ns ± 2%  -21.32%  (p=0.008 n=5+5)
AppendBytes/MultiBytesFirst-8      77.7ns ± 2%    71.2ns ± 3%   -8.37%  (p=0.008 n=5+5)
AppendBytes/MultiBytesMiddle-8     63.6ns ± 3%    57.8ns ± 5%   -9.12%  (p=0.008 n=5+5)
This commit is contained in:
Olivier Poitrey 2018-03-23 02:45:05 -07:00
parent 5250a1ba2d
commit 4ea03de40d
4 changed files with 177 additions and 161 deletions

85
internal/json/bytes.go Normal file
View File

@ -0,0 +1,85 @@
package json
import "unicode/utf8"
// AppendBytes appends s to dst as a double-quoted JSON string and returns
// the extended slice. It is the []byte counterpart of AppendString.
//
// When every byte of s is marked safe in noEscapeTable, s is copied verbatim
// between the quotes. Otherwise the body of the string is produced by
// appendBytesComplex, which is given the index of the first byte that is not
// marked safe.
func AppendBytes(dst, s []byte) []byte {
	dst = append(dst, '"')
	for pos, c := range s {
		if noEscapeTable[c] {
			continue
		}
		// Slow path: at least one byte cannot be copied as-is.
		return append(appendBytesComplex(dst, s, pos), '"')
	}
	// Fast path: nothing needs escaping.
	return append(append(dst, s...), '"')
}
// AppendHex appends the hexadecimal encoding of s, wrapped in double quotes,
// to dst and returns the extended slice.
//
// Each input byte becomes two characters taken from the hex lookup table:
// first the one for its high nibble, then the one for its low nibble.
func AppendHex(dst, s []byte) []byte {
	dst = append(dst, '"')
	for i := 0; i < len(s); i++ {
		hi, lo := s[i]>>4, s[i]&0x0f
		dst = append(dst, hex[hi], hex[lo])
	}
	dst = append(dst, '"')
	return dst
}
// appendBytesComplex appends the JSON-escaped form of s to dst, without the
// surrounding quotes, and returns the extended slice. It is the []byte
// counterpart of appendStringComplex.
//
// Scanning starts at index i, but output always starts from the beginning of
// s: bytes before i are copied as part of the first pending run.
func appendBytesComplex(dst, s []byte, i int) []byte {
	// pending is the start of the run of bytes that has been scanned but not
	// yet copied to dst. Invariant: pending <= i, so s[pending:i] is always a
	// valid (possibly empty) slice.
	pending := 0
	for i < len(s) {
		c := s[i]

		switch {
		case c >= utf8.RuneSelf:
			r, width := utf8.DecodeRune(s[i:])
			if r != utf8.RuneError || width != 1 {
				// Successfully decoded rune: it stays in the pending run and
				// is copied through unchanged later.
				i += width
				continue
			}
			// Invalid UTF-8: flush the pending run and write the escaped
			// replacement character in place of the offending byte.
			dst = append(dst, s[pending:i]...)
			dst = append(dst, `\ufffd`...)
			i += width
			pending = i
			continue
		case noEscapeTable[c]:
			// Plain byte, no escaping required.
			i++
			continue
		}

		// c must be escaped. Flush everything accumulated so far, then emit
		// the escape sequence for c.
		dst = append(dst, s[pending:i]...)
		switch c {
		case '\\', '"':
			dst = append(dst, '\\', c)
		case '\t':
			dst = append(dst, `\t`...)
		case '\r':
			dst = append(dst, `\r`...)
		case '\n':
			dst = append(dst, `\n`...)
		case '\f':
			dst = append(dst, `\f`...)
		case '\b':
			dst = append(dst, `\b`...)
		default:
			// Any other byte that needs escaping is written as \u00XX.
			dst = append(dst, `\u00`...)
			dst = append(dst, hex[c>>4], hex[c&0xF])
		}
		i++
		pending = i
	}
	// Copy whatever is left after the last escape (possibly nothing).
	return append(dst, s[pending:]...)
}

View File

@ -0,0 +1,82 @@
package json
import (
"testing"
"unicode"
)
// TestAppendBytes runs AppendBytes over every entry of encodeStringTests and
// checks the output against the entry's expected encoding.
func TestAppendBytes(t *testing.T) {
	for _, tc := range encodeStringTests {
		got := string(AppendBytes([]byte{}, []byte(tc.in)))
		if got == tc.out {
			continue
		}
		t.Errorf("appendBytes(%q) = %#q, want %#q", tc.in, got, tc.out)
	}
}
// TestAppendHex runs AppendHex on the single input byte of every entry of
// encodeHexTests and checks the output against the entry's expected string.
func TestAppendHex(t *testing.T) {
	for _, tc := range encodeHexTests {
		got := string(AppendHex([]byte{}, []byte{tc.in}))
		if got == tc.out {
			continue
		}
		t.Errorf("appendHex(%x) = %s, want %s", tc.in, got, tc.out)
	}
}
// TestStringBytes checks that AppendString and AppendBytes produce the same
// encoding for an input containing every rune from U+0000 through
// unicode.MaxRune followed by a few invalid UTF-8 bytes.
func TestStringBytes(t *testing.T) {
	t.Parallel()

	var runes []rune
	for r := rune(0); r <= unicode.MaxRune; r++ {
		runes = append(runes, r)
	}
	input := string(runes) + "\xff\xff\xffhello" // some invalid UTF-8 too

	fromString := string(AppendString([]byte{}, input))
	fromBytes := string(AppendBytes([]byte{}, []byte(input)))
	if fromString == fromBytes {
		return
	}

	// The outputs are large, so narrow the report down to the differing
	// region. First drop the common prefix.
	n := 0
	for n < len(fromString) && n < len(fromBytes) && fromString[n] == fromBytes[n] {
		n++
	}
	fromString, fromBytes = fromString[n:], fromBytes[n:]

	// Then drop the common suffix.
	n = 0
	for n < len(fromString) && n < len(fromBytes) &&
		fromString[len(fromString)-n-1] == fromBytes[len(fromBytes)-n-1] {
		n++
	}
	fromString, fromBytes = fromString[:len(fromString)-n], fromBytes[:len(fromBytes)-n]

	// Finally cap what is left so the failure message stays readable.
	if len(fromString) > 20 {
		fromString = fromString[:20] + "..."
	}
	if len(fromBytes) > 20 {
		fromBytes = fromBytes[:20] + "..."
	}
	t.Errorf("encodings differ at %#q vs %#q", fromString, fromBytes)
}
// BenchmarkAppendBytes measures AppendBytes on short inputs that differ in
// whether, and where, a byte needing special handling appears: none at all,
// a double quote (first/middle/last), or a multi-byte UTF-8 character
// (first/middle/last).
func BenchmarkAppendBytes(b *testing.B) {
	tests := map[string]string{
		"NoEncoding":       `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
		"EncodingFirst":    `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
		"EncodingMiddle":   `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`,
		"EncodingLast":     `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`,
		// The multi-byte character must lead the input here; without it this
		// case would only measure the no-escape fast path.
		"MultiBytesFirst":  `❤aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
		"MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤aaaaaaaaaaaaaaaaaaaaaaaa`,
		"MultiBytesLast":   `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤`,
	}
	for name, str := range tests {
		// Convert once per case so the conversion is not part of the
		// measured loop.
		byt := []byte(str)
		b.Run(name, func(b *testing.B) {
			// buf has length 0 and capacity 100, which is enough room for
			// every input above plus its escapes and quotes.
			buf := make([]byte, 0, 100)
			for i := 0; i < b.N; i++ {
				_ = AppendBytes(buf, byt)
			}
		})
	}
}

View File

@ -4,6 +4,14 @@ import "unicode/utf8"
const hex = "0123456789abcdef" const hex = "0123456789abcdef"
// noEscapeTable[b] is true when byte b can be copied into a JSON string
// as-is: ASCII 0x20 through 0x7e, except the double quote and the backslash.
// Every other entry, including all bytes at or above 0x7f, keeps the zero
// value false.
var noEscapeTable = [256]bool{}

// init fills in the noEscapeTable entries for bytes 0x00 through 0x7e.
func init() {
	for c := range noEscapeTable[:0x7f] {
		switch {
		case c < 0x20, c == '"', c == '\\':
			noEscapeTable[c] = false
		default:
			noEscapeTable[c] = true
		}
	}
}
// AppendStrings encodes the input strings to json and // AppendStrings encodes the input strings to json and
// appends the encoded string list to the input byte slice. // appends the encoded string list to the input byte slice.
func AppendStrings(dst []byte, vals []string) []byte { func AppendStrings(dst []byte, vals []string) []byte {
@ -38,7 +46,7 @@ func AppendString(dst []byte, s string) []byte {
// Check if the character needs encoding. Control characters, slashes, // Check if the character needs encoding. Control characters, slashes,
// and the double quote need json encoding. Bytes above the ascii // and the double quote need json encoding. Bytes above the ascii
// boundary needs utf8 encoding. // boundary needs utf8 encoding.
if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' { if !noEscapeTable[s[i]] {
// We encountered a character that needs to be encoded. Switch // We encountered a character that needs to be encoded. Switch
// to complex version of the algorithm. // to complex version of the algorithm.
dst = appendStringComplex(dst, s, i) dst = appendStringComplex(dst, s, i)
@ -76,89 +84,7 @@ func appendStringComplex(dst []byte, s string, i int) []byte {
i += size i += size
continue continue
} }
if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' { if noEscapeTable[b] {
i++
continue
}
// We encountered a character that needs to be encoded.
// Let's append the previous simple characters to the byte slice
// and switch our operation to read and encode the remainder
// characters byte-by-byte.
if start < i {
dst = append(dst, s[start:i]...)
}
switch b {
case '"', '\\':
dst = append(dst, '\\', b)
case '\b':
dst = append(dst, '\\', 'b')
case '\f':
dst = append(dst, '\\', 'f')
case '\n':
dst = append(dst, '\\', 'n')
case '\r':
dst = append(dst, '\\', 'r')
case '\t':
dst = append(dst, '\\', 't')
default:
dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF])
}
i++
start = i
}
if start < len(s) {
dst = append(dst, s[start:]...)
}
return dst
}
// AppendBytes is a mirror of appendString with []byte arg
func AppendBytes(dst, s []byte) []byte {
dst = append(dst, '"')
for i := 0; i < len(s); i++ {
if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' {
dst = appendBytesComplex(dst, s, i)
return append(dst, '"')
}
}
dst = append(dst, s...)
return append(dst, '"')
}
// AppendHex encodes the input bytes to a hex string and appends
// the encoded string to the input byte slice.
//
// The operation loops though each byte and encodes it as hex using
// the hex lookup table.
func AppendHex(dst, s []byte) []byte {
dst = append(dst, '"')
for _, v := range s {
dst = append(dst, hex[v>>4], hex[v&0x0f])
}
return append(dst, '"')
}
// appendBytesComplex is a mirror of the appendStringComplex
// with []byte arg
func appendBytesComplex(dst, s []byte, i int) []byte {
start := 0
for i < len(s) {
b := s[i]
if b >= utf8.RuneSelf {
r, size := utf8.DecodeRune(s[i:])
if r == utf8.RuneError && size == 1 {
if start < i {
dst = append(dst, s[start:i]...)
}
dst = append(dst, `\ufffd`...)
i += size
start = i
continue
}
i += size
continue
}
if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' {
i++ i++
continue continue
} }

View File

@ -2,7 +2,6 @@ package json
import ( import (
"testing" "testing"
"unicode"
) )
var encodeStringTests = []struct { var encodeStringTests = []struct {
@ -73,61 +72,6 @@ func TestAppendString(t *testing.T) {
} }
} }
func TestAppendBytes(t *testing.T) {
for _, tt := range encodeStringTests {
b := AppendBytes([]byte{}, []byte(tt.in))
if got, want := string(b), tt.out; got != want {
t.Errorf("appendBytes(%q) = %#q, want %#q", tt.in, got, want)
}
}
}
func TestAppendHex(t *testing.T) {
for _, tt := range encodeHexTests {
b := AppendHex([]byte{}, []byte{tt.in})
if got, want := string(b), tt.out; got != want {
t.Errorf("appendHex(%x) = %s, want %s", tt.in, got, want)
}
}
}
func TestStringBytes(t *testing.T) {
t.Parallel()
// Test that encodeState.stringBytes and encodeState.string use the same encoding.
var r []rune
for i := '\u0000'; i <= unicode.MaxRune; i++ {
r = append(r, i)
}
s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too
enc := string(AppendString([]byte{}, s))
encBytes := string(AppendBytes([]byte{}, []byte(s)))
if enc != encBytes {
i := 0
for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] {
i++
}
enc = enc[i:]
encBytes = encBytes[i:]
i = 0
for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] {
i++
}
enc = enc[:len(enc)-i]
encBytes = encBytes[:len(encBytes)-i]
if len(enc) > 20 {
enc = enc[:20] + "..."
}
if len(encBytes) > 20 {
encBytes = encBytes[:20] + "..."
}
t.Errorf("encodings differ at %#q vs %#q", enc, encBytes)
}
}
func BenchmarkAppendString(b *testing.B) { func BenchmarkAppendString(b *testing.B) {
tests := map[string]string{ tests := map[string]string{
"NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
@ -147,24 +91,3 @@ func BenchmarkAppendString(b *testing.B) {
}) })
} }
} }
func BenchmarkAppendBytes(b *testing.B) {
tests := map[string]string{
"NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
"EncodingFirst": `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
"EncodingMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`,
"EncodingLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`,
"MultiBytesFirst": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`,
"MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤aaaaaaaaaaaaaaaaaaaaaaaa`,
"MultiBytesLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤`,
}
for name, str := range tests {
byt := []byte(str)
b.Run(name, func(b *testing.B) {
buf := make([]byte, 0, 100)
for i := 0; i < b.N; i++ {
_ = AppendBytes(buf, byt)
}
})
}
}