197 lines
5.5 KiB
Go

// Copyright 2015 Garrett D'Amore
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use file except in compliance with the License.
// You may obtain a copy of the license at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package encoding
import (
"sync"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)
const (
// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
RuneError = '\uFFFD'
// RuneSelf is the rune below which UTF-8 and the Unicode values are
// identical. Its also the limit for ASCII.
RuneSelf = 0x80
// ASCIISub is the ASCII substitution character.
ASCIISub = '\x1a'
)
// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF8 and that other character set. It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation. This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is there is no shift character or anything like that.) Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches. The difference was down to a couple of nsec/op, and
// no consistent pattern as to which ran faster. With the conversion to
// UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
// direction takes about 100 nsec/op. (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.
//
type Charmap struct {
transform.NopResetter
bytes map[rune]byte
runes [256][]byte
once sync.Once
// The map between bytes and runes. To indicate that a specific
// byte value is invalid for a charcter set, use the rune
// utf8.RuneError. Values that are absent from this map will
// be assumed to have the identity mapping -- that is the default
// is to assume ISO8859-1, where all 8-bit characters have the same
// numeric value as their Unicode runes. (Not to be confused with
// the UTF-8 values, which *will* be different for non-ASCII runes.)
//
// If no values less than RuneSelf are changed (or have non-identity
// mappings), then the character set is assumed to be an ASCII
// superset, and certain assumptions and optimizations become
// available for ASCII bytes.
Map map[byte]rune
// The ReplacementChar is the byte value to use for substitution.
// It should normally be ASCIISub for ASCII encodings. This may be
// unset (left to zero) for mappings that are strictly ASCII supersets.
// In that case ASCIISub will be assumed instead.
ReplacementChar byte
}
type cmapDecoder struct {
transform.NopResetter
runes [256][]byte
}
type cmapEncoder struct {
transform.NopResetter
bytes map[rune]byte
replace byte
}
// Init initializes internal values of a character map. This should
// be done early, to minimize the cost of allocation of transforms
// later. It is not strictly necessary however, as the allocation
// functions will arrange to call it if it has not already been done.
func (c *Charmap) Init() {
c.once.Do(c.initialize)
}
func (c *Charmap) initialize() {
c.bytes = make(map[rune]byte)
ascii := true
for i := 0; i < 256; i++ {
r, ok := c.Map[byte(i)]
if !ok {
r = rune(i)
}
if r < 128 && r != rune(i) {
ascii = false
}
if r != RuneError {
c.bytes[r] = byte(i)
}
utf := make([]byte, utf8.RuneLen(r))
utf8.EncodeRune(utf, r)
c.runes[i] = utf
}
if ascii && c.ReplacementChar == '\x00' {
c.ReplacementChar = ASCIISub
}
}
// NewDecoder returns a Decoder the converts from the 8-bit
// character set to UTF-8. Unknown mappings, if any, are mapped
// to '\uFFFD'.
func (c *Charmap) NewDecoder() *encoding.Decoder {
c.Init()
return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
}
// NewEncoder returns a Transformer that converts from UTF8 to the
// 8-bit character set. Unknown mappings are mapped to 0x1A.
func (c *Charmap) NewEncoder() *encoding.Encoder {
c.Init()
return &encoding.Encoder{
Transformer: &cmapEncoder{
bytes: c.bytes,
replace: c.ReplacementChar,
},
}
}
func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
var e error
var ndst, nsrc int
for _, c := range src {
b := d.runes[c]
l := len(b)
if ndst+l > len(dst) {
e = transform.ErrShortDst
break
}
for i := 0; i < l; i++ {
dst[ndst] = b[i]
ndst++
}
nsrc++
}
return ndst, nsrc, e
}
func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
var e error
var ndst, nsrc int
for nsrc < len(src) {
if ndst >= len(dst) {
e = transform.ErrShortDst
break
}
r, sz := utf8.DecodeRune(src[nsrc:])
if r == utf8.RuneError && sz == 1 {
// If its inconclusive due to insufficient data in
// in the source, report it
if !atEOF && !utf8.FullRune(src[nsrc:]) {
e = transform.ErrShortSrc
break
}
}
if c, ok := d.bytes[r]; ok {
dst[ndst] = c
} else {
dst[ndst] = d.replace
}
nsrc += sz
ndst++
}
return ndst, nsrc, e
}