// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version // 2.0 (the "License"); you may not use this file except in compliance with the // License. You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. package goavro import ( "encoding/hex" "errors" "fmt" "io" "os" "unicode" "unicode/utf16" "unicode/utf8" ) //////////////////////////////////////// // Binary Decode //////////////////////////////////////// func bytesNativeFromBinary(buf []byte) (interface{}, []byte, error) { if len(buf) < 1 { return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer) } var decoded interface{} var err error if decoded, buf, err = longNativeFromBinary(buf); err != nil { return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", err) } size := decoded.(int64) // always returns int64 if size < 0 { return nil, nil, fmt.Errorf("cannot decode binary bytes: negative size: %d", size) } if size > int64(len(buf)) { return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer) } return buf[:size], buf[size:], nil } func stringNativeFromBinary(buf []byte) (interface{}, []byte, error) { d, b, err := bytesNativeFromBinary(buf) if err != nil { return nil, nil, fmt.Errorf("cannot decode binary string: %s", err) } return string(d.([]byte)), b, nil } //////////////////////////////////////// // Binary Encode //////////////////////////////////////// func bytesBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { var someBytes []byte switch d := datum.(type) { case []byte: someBytes = d case string: someBytes = []byte(d) default: return nil, fmt.Errorf("cannot encode binary bytes: expected: []byte or string; received: %T", datum) } buf, _ = longBinaryFromNative(buf, len(someBytes)) // only fails when given non integer return append(buf, someBytes...), nil // append datum bytes } func stringBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { var someBytes []byte switch d := datum.(type) { case []byte: someBytes = d case string: someBytes = []byte(d) default: return nil, fmt.Errorf("cannot encode binary bytes: expected: string; received: %T", datum) } buf, _ = longBinaryFromNative(buf, len(someBytes)) // only fails when given non integer return append(buf, someBytes...), nil // append datum bytes } //////////////////////////////////////// // Text Decode //////////////////////////////////////// func bytesNativeFromTextual(buf []byte) (interface{}, []byte, error) { buflen := len(buf) if buflen < 2 { return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer) } if buf[0] != '"' { return nil, nil, fmt.Errorf("cannot decode textual bytes: expected initial \"; found: %#U", buf[0]) } var newBytes []byte var escaped bool // Loop through bytes following initial double quote, but note we will // return immediately when find unescaped double quote. for i := 1; i < buflen; i++ { b := buf[i] if escaped { escaped = false if b2, ok := unescapeSpecialJSON(b); ok { newBytes = append(newBytes, b2) continue } if b == 'u' { // NOTE: Need at least 4 more bytes to read uint16, but subtract // 1 because do not want to count the trailing quote and // subtract another 1 because already consumed u but have yet to // increment i. if i > buflen-6 { return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer) } // NOTE: Avro bytes represent binary data, and do not // necessarily represent text. Therefore, Avro bytes are not // encoded in UTF-16. Each \u is followed by 4 hexadecimal // digits, the first and second of which must be 0. v, err := parseUint64FromHexSlice(buf[i+3 : i+5]) if err != nil { return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", err) } i += 4 // absorb 4 characters: one 'u' and three of the digits newBytes = append(newBytes, byte(v)) continue } newBytes = append(newBytes, b) continue } if b == '\\' { escaped = true continue } if b == '"' { return newBytes, buf[i+1:], nil } newBytes = append(newBytes, b) } return nil, nil, fmt.Errorf("cannot decode textual bytes: expected final \"; found: %#U", buf[buflen-1]) } func stringNativeFromTextual(buf []byte) (interface{}, []byte, error) { buflen := len(buf) if buflen < 2 { return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer) } if buf[0] != '"' { return nil, nil, fmt.Errorf("cannot decode textual string: expected initial \"; found: %#U", buf[0]) } var newBytes []byte var escaped bool // Loop through bytes following initial double quote, but note we will // return immediately when find unescaped double quote. for i := 1; i < buflen; i++ { b := buf[i] if escaped { escaped = false if b2, ok := unescapeSpecialJSON(b); ok { newBytes = append(newBytes, b2) continue } if b == 'u' { // NOTE: Need at least 4 more bytes to read uint16, but subtract // 1 because do not want to count the trailing quote and // subtract another 1 because already consumed u but have yet to // increment i. if i > buflen-6 { return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer) } v, err := parseUint64FromHexSlice(buf[i+1 : i+5]) if err != nil { return nil, nil, fmt.Errorf("cannot decode textual string: %s", err) } i += 4 // absorb 4 characters: one 'u' and three of the digits nbl := len(newBytes) newBytes = append(newBytes, []byte{0, 0, 0, 0}...) // grow to make room for UTF-8 encoded rune r := rune(v) if utf16.IsSurrogate(r) { i++ // absorb final hexadecimal digit from previous value // Expect second half of surrogate pair if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' { return nil, nil, errors.New("cannot decode textual string: missing second half of surrogate pair") } v, err = parseUint64FromHexSlice(buf[i+2 : i+6]) if err != nil { return nil, nil, fmt.Errorf("cannot decode textual string: %s", err) } i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits // Get code point by combining high and low surrogate bits r = utf16.DecodeRune(r, rune(v)) } width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point newBytes = newBytes[:nbl+width] // trim off excess bytes continue } newBytes = append(newBytes, b) continue } if b == '\\' { escaped = true continue } if b == '"' { return string(newBytes), buf[i+1:], nil } newBytes = append(newBytes, b) } if escaped { return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer) } return nil, nil, fmt.Errorf("cannot decode textual string: expected final \"; found: %x", buf[buflen-1]) } func unescapeUnicodeString(some string) (string, error) { if some == "" { return "", nil } buf := []byte(some) buflen := len(buf) var i int var newBytes []byte var escaped bool // Loop through bytes following initial double quote, but note we will // return immediately when find unescaped double quote. for i = 0; i < buflen; i++ { b := buf[i] if escaped { escaped = false if b == 'u' { // NOTE: Need at least 4 more bytes to read uint16, but subtract // 1 because do not want to count the trailing quote and // subtract another 1 because already consumed u but have yet to // increment i. if i > buflen-6 { return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalent: %s", io.ErrShortBuffer) } v, err := parseUint64FromHexSlice(buf[i+1 : i+5]) if err != nil { return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalent: %s", err) } i += 4 // absorb 4 characters: one 'u' and three of the digits nbl := len(newBytes) newBytes = append(newBytes, []byte{0, 0, 0, 0}...) // grow to make room for UTF-8 encoded rune r := rune(v) if utf16.IsSurrogate(r) { i++ // absorb final hexadecimal digit from previous value // Expect second half of surrogate pair if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' { return "", errors.New("cannot replace escaped characters with UTF-8 equivalent: missing second half of surrogate pair") } v, err = parseUint64FromHexSlice(buf[i+2 : i+6]) if err != nil { return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalents: %s", err) } i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits // Get code point by combining high and low surrogate bits r = utf16.DecodeRune(r, rune(v)) } width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point newBytes = newBytes[:nbl+width] // trim off excess bytes continue } newBytes = append(newBytes, b) continue } if b == '\\' { escaped = true continue } newBytes = append(newBytes, b) } if escaped { return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalents: %s", io.ErrShortBuffer) } return string(newBytes), nil } func parseUint64FromHexSlice(buf []byte) (uint64, error) { var value uint64 for _, b := range buf { diff := uint64(b - '0') if diff < 10 { value = (value << 4) | diff continue } b10 := b + 10 diff = uint64(b10 - 'A') if diff < 10 { return 0, hex.InvalidByteError(b) } if diff < 16 { value = (value << 4) | diff continue } diff = uint64(b10 - 'a') if diff < 10 { return 0, hex.InvalidByteError(b) } if diff < 16 { value = (value << 4) | diff continue } return 0, hex.InvalidByteError(b) } return value, nil } func unescapeSpecialJSON(b byte) (byte, bool) { // NOTE: The following 8 special JSON characters must be escaped: switch b { case '"', '\\', '/': return b, true case 'b': return '\b', true case 'f': return '\f', true case 'n': return '\n', true case 'r': return '\r', true case 't': return '\t', true } return b, false } //////////////////////////////////////// // Text Encode //////////////////////////////////////// func bytesTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { var someBytes []byte switch d := datum.(type) { case []byte: someBytes = d case string: someBytes = []byte(d) default: return nil, fmt.Errorf("cannot encode textual bytes: expected: []byte or string; received: %T", datum) } buf = append(buf, '"') // prefix buffer with double quote for _, b := range someBytes { if escaped, ok := escapeSpecialJSON(b); ok { buf = append(buf, escaped...) continue } if r := rune(b); r < utf8.RuneSelf && unicode.IsPrint(r) { buf = append(buf, b) continue } // This Code Point _could_ be encoded as a single byte, however, it's // above standard ASCII range (b > 127), therefore must encode using its // four-byte hexadecimal equivalent, which will always start with the // high byte 00 buf = appendUnicodeHex(buf, uint16(b)) } return append(buf, '"'), nil // postfix buffer with double quote } func stringTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { var someString string switch d := datum.(type) { case []byte: someString = string(d) case string: someString = d default: return nil, fmt.Errorf("cannot encode textual string: expected: []byte or string; received: %T", datum) } buf = append(buf, '"') // prefix buffer with double quote for _, r := range someString { if r < utf8.RuneSelf { if escaped, ok := escapeSpecialJSON(byte(r)); ok { buf = append(buf, escaped...) continue } if unicode.IsPrint(r) { buf = append(buf, byte(r)) continue } } // NOTE: Attempt to encode code point as UTF-16 surrogate pair r1, r2 := utf16.EncodeRune(r) if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { // code point does require surrogate pair, and thus two uint16 values buf = appendUnicodeHex(buf, uint16(r1)) buf = appendUnicodeHex(buf, uint16(r2)) continue } // Code Point does not require surrogate pair. buf = appendUnicodeHex(buf, uint16(r)) } return append(buf, '"'), nil // postfix buffer with double quote } func appendUnicodeHex(buf []byte, v uint16) []byte { // Start with '\u' prefix: buf = append(buf, sliceUnicode...) // And tack on 4 hexadecimal digits: buf = append(buf, hexDigits[(v&0xF000)>>12]) buf = append(buf, hexDigits[(v&0xF00)>>8]) buf = append(buf, hexDigits[(v&0xF0)>>4]) buf = append(buf, hexDigits[(v&0xF)]) return buf } const hexDigits = "0123456789ABCDEF" func escapeSpecialJSON(b byte) ([]byte, bool) { // NOTE: The following 8 special JSON characters must be escaped: switch b { case '"': return sliceQuote, true case '\\': return sliceBackslash, true case '/': return sliceSlash, true case '\b': return sliceBackspace, true case '\f': return sliceFormfeed, true case '\n': return sliceNewline, true case '\r': return sliceCarriageReturn, true case '\t': return sliceTab, true } return nil, false } // While slices in Go are never constants, we can initialize them once and reuse // them many times. We define these slices at library load time and reuse them // when encoding JSON. var ( sliceQuote = []byte("\\\"") sliceBackslash = []byte("\\\\") sliceSlash = []byte("\\/") sliceBackspace = []byte("\\b") sliceFormfeed = []byte("\\f") sliceNewline = []byte("\\n") sliceCarriageReturn = []byte("\\r") sliceTab = []byte("\\t") sliceUnicode = []byte("\\u") ) // DEBUG -- remove function prior to committing func decodedStringFromJSON(buf []byte) (string, []byte, error) { fmt.Fprintf(os.Stderr, "decodedStringFromJSON(%v)\n", buf) buflen := len(buf) if buflen < 2 { return "", buf, fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer) } if buf[0] != '"' { return "", buf, fmt.Errorf("cannot decode string: expected initial '\"'; found: %#U", buf[0]) } var newBytes []byte var escaped, ok bool // Loop through bytes following initial double quote, but note we will // return immediately when find unescaped double quote. for i := 1; i < buflen; i++ { b := buf[i] if escaped { escaped = false if b, ok = unescapeSpecialJSON(b); ok { newBytes = append(newBytes, b) continue } if b == 'u' { // NOTE: Need at least 4 more bytes to read uint16, but subtract // 1 because do not want to count the trailing quote and // subtract another 1 because already consumed u but have yet to // increment i. if i > buflen-6 { return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer) } v, err := parseUint64FromHexSlice(buf[i+1 : i+5]) if err != nil { return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", err) } i += 4 // absorb 4 characters: one 'u' and three of the digits nbl := len(newBytes) newBytes = append(newBytes, 0, 0, 0, 0) // grow to make room for UTF-8 encoded rune r := rune(v) if utf16.IsSurrogate(r) { i++ // absorb final hexidecimal digit from previous value // Expect second half of surrogate pair if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' { return "", buf[i+1:], errors.New("cannot decode string: missing second half of surrogate pair") } v, err = parseUint64FromHexSlice(buf[i+2 : i+6]) if err != nil { return "", buf[i+1:], fmt.Errorf("cannot decode string: cannot decode second half of surrogate pair: %s", err) } i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits // Get code point by combining high and low surrogate bits r = utf16.DecodeRune(r, rune(v)) } width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point newBytes = newBytes[:nbl+width] // trim off excess bytes continue } newBytes = append(newBytes, b) continue } if b == '\\' { escaped = true continue } if b == '"' { return string(newBytes), buf[i+1:], nil } newBytes = append(newBytes, b) } return "", buf, fmt.Errorf("cannot decode string: expected final '\"'; found: %#U", buf[buflen-1]) }