diff --git a/internal/json/bytes.go b/internal/json/bytes.go new file mode 100644 index 0000000..8f7d5fe --- /dev/null +++ b/internal/json/bytes.go @@ -0,0 +1,85 @@ +package json + +import "unicode/utf8" + +// AppendBytes is a mirror of AppendString with []byte arg +func AppendBytes(dst, s []byte) []byte { + dst = append(dst, '"') + for i := 0; i < len(s); i++ { + if !noEscapeTable[s[i]] { + dst = appendBytesComplex(dst, s, i) + return append(dst, '"') + } + } + dst = append(dst, s...) + return append(dst, '"') +} + +// AppendHex encodes the input bytes to a hex string and appends +// the encoded string to the input byte slice. +// +// The operation loops through each byte and encodes it as hex using +// the hex lookup table. +func AppendHex(dst, s []byte) []byte { + dst = append(dst, '"') + for _, v := range s { + dst = append(dst, hex[v>>4], hex[v&0x0f]) + } + return append(dst, '"') +} + +// appendBytesComplex is a mirror of the appendStringComplex +// with []byte arg +func appendBytesComplex(dst, s []byte, i int) []byte { + start := 0 + for i < len(s) { + b := s[i] + if b >= utf8.RuneSelf { + r, size := utf8.DecodeRune(s[i:]) + if r == utf8.RuneError && size == 1 { + if start < i { + dst = append(dst, s[start:i]...) + } + dst = append(dst, `\ufffd`...) + i += size + start = i + continue + } + i += size + continue + } + if noEscapeTable[b] { + i++ + continue + } + // We encountered a character that needs to be encoded. + // Let's append the previous simple characters to the byte slice + // and switch our operation to read and encode the remainder + // characters byte-by-byte. + if start < i { + dst = append(dst, s[start:i]...) 
+ } + switch b { + case '"', '\\': + dst = append(dst, '\\', b) + case '\b': + dst = append(dst, '\\', 'b') + case '\f': + dst = append(dst, '\\', 'f') + case '\n': + dst = append(dst, '\\', 'n') + case '\r': + dst = append(dst, '\\', 'r') + case '\t': + dst = append(dst, '\\', 't') + default: + dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF]) + } + i++ + start = i + } + if start < len(s) { + dst = append(dst, s[start:]...) + } + return dst +} diff --git a/internal/json/bytes_test.go b/internal/json/bytes_test.go new file mode 100644 index 0000000..e33c1e0 --- /dev/null +++ b/internal/json/bytes_test.go @@ -0,0 +1,82 @@ +package json + +import ( + "testing" + "unicode" +) + +func TestAppendBytes(t *testing.T) { + for _, tt := range encodeStringTests { + b := AppendBytes([]byte{}, []byte(tt.in)) + if got, want := string(b), tt.out; got != want { + t.Errorf("appendBytes(%q) = %#q, want %#q", tt.in, got, want) + } + } +} + +func TestAppendHex(t *testing.T) { + for _, tt := range encodeHexTests { + b := AppendHex([]byte{}, []byte{tt.in}) + if got, want := string(b), tt.out; got != want { + t.Errorf("appendHex(%x) = %s, want %s", tt.in, got, want) + } + } +} + +func TestStringBytes(t *testing.T) { + t.Parallel() + // Test that AppendBytes and AppendString use the same encoding. + var r []rune + for i := '\u0000'; i <= unicode.MaxRune; i++ { + r = append(r, i) + } + s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too + + enc := string(AppendString([]byte{}, s)) + encBytes := string(AppendBytes([]byte{}, []byte(s))) + + if enc != encBytes { + i := 0 + for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] { + i++ + } + enc = enc[i:] + encBytes = encBytes[i:] + i = 0 + for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] { + i++ + } + enc = enc[:len(enc)-i] + encBytes = encBytes[:len(encBytes)-i] + + if len(enc) > 20 { + enc = enc[:20] + "..." 
+ } + if len(encBytes) > 20 { + encBytes = encBytes[:20] + "..." + } + + t.Errorf("encodings differ at %#q vs %#q", enc, encBytes) + } +} + +func BenchmarkAppendBytes(b *testing.B) { + tests := map[string]string{ + "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingFirst": `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`, + "EncodingLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`, + "MultiBytesFirst": `❤️aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, + "MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤️aaaaaaaaaaaaaaaaaaaaaaaa`, + "MultiBytesLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤️`, + } + for name, str := range tests { + byt := []byte(str) + b.Run(name, func(b *testing.B) { + buf := make([]byte, 0, 100) + for i := 0; i < b.N; i++ { + _ = AppendBytes(buf, byt) + } + }) + } +} diff --git a/internal/json/string.go b/internal/json/string.go index 7f85ad6..bb606f0 100644 --- a/internal/json/string.go +++ b/internal/json/string.go @@ -4,6 +4,14 @@ import "unicode/utf8" const hex = "0123456789abcdef" +var noEscapeTable = [256]bool{} + +func init() { + for i := 0; i <= 0x7e; i++ { + noEscapeTable[i] = i >= 0x20 && i != '\\' && i != '"' + } +} + // AppendStrings encodes the input strings to json and // appends the encoded string list to the input byte slice. func AppendStrings(dst []byte, vals []string) []byte { @@ -38,7 +46,7 @@ func AppendString(dst []byte, s string) []byte { // Check if the character needs encoding. Control characters, slashes, // and the double quote need json encoding. Bytes above the ascii // boundary needs utf8 encoding. - if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' { + if !noEscapeTable[s[i]] { // We encountered a character that needs to be encoded. Switch // to complex version of the algorithm. 
dst = appendStringComplex(dst, s, i) @@ -76,89 +84,7 @@ func appendStringComplex(dst []byte, s string, i int) []byte { i += size continue } - if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' { - i++ - continue - } - // We encountered a character that needs to be encoded. - // Let's append the previous simple characters to the byte slice - // and switch our operation to read and encode the remainder - // characters byte-by-byte. - if start < i { - dst = append(dst, s[start:i]...) - } - switch b { - case '"', '\\': - dst = append(dst, '\\', b) - case '\b': - dst = append(dst, '\\', 'b') - case '\f': - dst = append(dst, '\\', 'f') - case '\n': - dst = append(dst, '\\', 'n') - case '\r': - dst = append(dst, '\\', 'r') - case '\t': - dst = append(dst, '\\', 't') - default: - dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF]) - } - i++ - start = i - } - if start < len(s) { - dst = append(dst, s[start:]...) - } - return dst -} - -// AppendBytes is a mirror of appendString with []byte arg -func AppendBytes(dst, s []byte) []byte { - dst = append(dst, '"') - for i := 0; i < len(s); i++ { - if s[i] < 0x20 || s[i] > 0x7e || s[i] == '\\' || s[i] == '"' { - dst = appendBytesComplex(dst, s, i) - return append(dst, '"') - } - } - dst = append(dst, s...) - return append(dst, '"') -} - -// AppendHex encodes the input bytes to a hex string and appends -// the encoded string to the input byte slice. -// -// The operation loops though each byte and encodes it as hex using -// the hex lookup table. 
-func AppendHex(dst, s []byte) []byte { - dst = append(dst, '"') - for _, v := range s { - dst = append(dst, hex[v>>4], hex[v&0x0f]) - } - return append(dst, '"') -} - -// appendBytesComplex is a mirror of the appendStringComplex -// with []byte arg -func appendBytesComplex(dst, s []byte, i int) []byte { - start := 0 - for i < len(s) { - b := s[i] - if b >= utf8.RuneSelf { - r, size := utf8.DecodeRune(s[i:]) - if r == utf8.RuneError && size == 1 { - if start < i { - dst = append(dst, s[start:i]...) - } - dst = append(dst, `\ufffd`...) - i += size - start = i - continue - } - i += size - continue - } - if b >= 0x20 && b <= 0x7e && b != '\\' && b != '"' { + if noEscapeTable[b] { i++ continue } diff --git a/internal/json/string_test.go b/internal/json/string_test.go index 0d5fc6c..a30b124 100644 --- a/internal/json/string_test.go +++ b/internal/json/string_test.go @@ -2,7 +2,6 @@ package json import ( "testing" - "unicode" ) var encodeStringTests = []struct { @@ -73,61 +72,6 @@ func TestAppendString(t *testing.T) { } } -func TestAppendBytes(t *testing.T) { - for _, tt := range encodeStringTests { - b := AppendBytes([]byte{}, []byte(tt.in)) - if got, want := string(b), tt.out; got != want { - t.Errorf("appendBytes(%q) = %#q, want %#q", tt.in, got, want) - } - } -} - -func TestAppendHex(t *testing.T) { - for _, tt := range encodeHexTests { - b := AppendHex([]byte{}, []byte{tt.in}) - if got, want := string(b), tt.out; got != want { - t.Errorf("appendHex(%x) = %s, want %s", tt.in, got, want) - } - } -} - -func TestStringBytes(t *testing.T) { - t.Parallel() - // Test that encodeState.stringBytes and encodeState.string use the same encoding. 
- var r []rune - for i := '\u0000'; i <= unicode.MaxRune; i++ { - r = append(r, i) - } - s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too - - enc := string(AppendString([]byte{}, s)) - encBytes := string(AppendBytes([]byte{}, []byte(s))) - - if enc != encBytes { - i := 0 - for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] { - i++ - } - enc = enc[i:] - encBytes = encBytes[i:] - i = 0 - for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] { - i++ - } - enc = enc[:len(enc)-i] - encBytes = encBytes[:len(encBytes)-i] - - if len(enc) > 20 { - enc = enc[:20] + "..." - } - if len(encBytes) > 20 { - encBytes = encBytes[:20] + "..." - } - - t.Errorf("encodings differ at %#q vs %#q", enc, encBytes) - } -} - func BenchmarkAppendString(b *testing.B) { tests := map[string]string{ "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, @@ -147,24 +91,3 @@ func BenchmarkAppendString(b *testing.B) { }) } } - -func BenchmarkAppendBytes(b *testing.B) { - tests := map[string]string{ - "NoEncoding": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingFirst": `"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa"aaaaaaaaaaaaaaaaaaaaaaaa`, - "EncodingLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"`, - "MultiBytesFirst": `❤️aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`, - "MultiBytesMiddle": `aaaaaaaaaaaaaaaaaaaaaaaaa❤️aaaaaaaaaaaaaaaaaaaaaaaa`, - "MultiBytesLast": `aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa❤️`, - } - for name, str := range tests { - byt := []byte(str) - b.Run(name, func(b *testing.B) { - buf := make([]byte, 0, 100) - for i := 0; i < b.N; i++ { - _ = AppendBytes(buf, byt) - } - }) - } -}