diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 3294ed9..456c8ac 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -1,12 +1,9 @@ "use strict"; -var Buffer = require("safer-buffer").Buffer; // Multibyte codec. In this scheme, a character is represented by 1 or more bytes. // Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. // To save memory and loading time, we read table files only when requested. -exports._dbcs = DBCSCodec; - const UNASSIGNED = -1, GB18030_CODE = -2, SEQ_START = -10, @@ -21,588 +18,603 @@ for (let i = 0; i < 0x100; i++) { } // Class DBCSCodec reads and initializes mapping tables. -function DBCSCodec(codecOptions, iconv) { - this.encodingName = codecOptions.encodingName; - if (!codecOptions) throw new Error("DBCS codec is called without the data."); - if (!codecOptions.table) throw new Error("Encoding '" + this.encodingName + "' has no data."); - - // Load tables. - const mappingTable = codecOptions.table(); - - // Decode tables: MBCS -> Unicode. - - // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. - // Trie root is decodeTables[0]. - // Values: >= 0 -> unicode character code. can be > 0xFFFF - // == UNASSIGNED -> unknown/unassigned sequence. - // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. - // <= NODE_START -> index of the next node in our trie to process next byte. - // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. - this.decodeTables = []; - this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. - - // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. - this.decodeTableSeq = []; - - // Actual mapping tables consist of chunks. Use them to fill up decode tables. - for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); - - // Load & create GB18030 tables when needed. - if (typeof codecOptions.gb18030 === "function") { - this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. - - // Add GB18030 common decode nodes. - const commonThirdByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - const commonFourthByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - // Fill out the tree - const firstByteNode = this.decodeTables[0]; - for (let i = 0x81; i <= 0xfe; i++) { - const secondNodeIdx = NODE_START - firstByteNode[i]; - const secondByteNode = this.decodeTables[secondNodeIdx]; - for (let j = 0x30; j <= 0x39; j++) { - if (secondByteNode[j] === UNASSIGNED) { - secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; - } else if (secondByteNode[j] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 2"); - } - - const thirdNodeIdx = NODE_START - secondByteNode[j]; - const thirdByteNode = this.decodeTables[thirdNodeIdx]; - for (let k = 0x81; k <= 0xfe; k++) { - const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; - if (thirdByteNode[k] === UNASSIGNED) { - thirdByteNode[k] = commonFourthNodeIdx; - } else if (thirdByteNode[k] === commonFourthNodeIdx) { - continue; - } else if (thirdByteNode[k] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 3"); +exports._dbcs = class DBCSCodec { + constructor(codecOptions, iconv) { + this.encodingName = codecOptions.encodingName; + if (!codecOptions) throw new Error("DBCS codec is called without the data."); + if (!codecOptions.table) + throw new Error("Encoding '" + this.encodingName + "' has no data."); + + // Load tables. + const mappingTable = codecOptions.table(); + + // Decode tables: MBCS -> Unicode. + + // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. + // Trie root is decodeTables[0]. + // Values: >= 0 -> unicode character code. can be > 0xFFFF + // == UNASSIGNED -> unknown/unassigned sequence. + // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. + // <= NODE_START -> index of the next node in our trie to process next byte. + // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. + this.decodeTables = []; + this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. + + // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. + this.decodeTableSeq = []; + + // Actual mapping tables consist of chunks. Use them to fill up decode tables. + for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); + + // Load & create GB18030 tables when needed. + if (typeof codecOptions.gb18030 === "function") { + this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. + + // Add GB18030 common decode nodes. + const commonThirdByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + const commonFourthByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + // Fill out the tree + const firstByteNode = this.decodeTables[0]; + for (let i = 0x81; i <= 0xfe; i++) { + const secondNodeIdx = NODE_START - firstByteNode[i]; + const secondByteNode = this.decodeTables[secondNodeIdx]; + for (let j = 0x30; j <= 0x39; j++) { + if (secondByteNode[j] === UNASSIGNED) { + secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; + } else if (secondByteNode[j] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 2"); } - const fourthNodeIdx = NODE_START - thirdByteNode[k]; - const fourthByteNode = this.decodeTables[fourthNodeIdx]; - for (let l = 0x30; l <= 0x39; l++) { - if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + const thirdNodeIdx = NODE_START - secondByteNode[j]; + const thirdByteNode = this.decodeTables[thirdNodeIdx]; + for (let k = 0x81; k <= 0xfe; k++) { + const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; + if (thirdByteNode[k] === UNASSIGNED) { + thirdByteNode[k] = commonFourthNodeIdx; + } else if (thirdByteNode[k] === commonFourthNodeIdx) { + continue; + } else if (thirdByteNode[k] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 3"); + } + + const fourthNodeIdx = NODE_START - thirdByteNode[k]; + const fourthByteNode = this.decodeTables[fourthNodeIdx]; + for (let l = 0x30; l <= 0x39; l++) { + if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + } } } } } - } - this.defaultCharUnicode = iconv.defaultCharUnicode; - - // Encode tables: Unicode -> DBCS. - - // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. - // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. - // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). - // == UNASSIGNED -> no conversion found. Output a default char. - // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. - this.encodeTable = []; - - // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of - // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key - // means end of sequence (needed when one sequence is a strict subsequence of another). - // Objects are kept separately from encodeTable to increase performance. - this.encodeTableSeq = []; - - // Some chars can be decoded, but need not be encoded. - const skipEncodeChars = {}; - if (codecOptions.encodeSkipVals) - for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { - const val = codecOptions.encodeSkipVals[i]; - if (typeof val === "number") { - skipEncodeChars[val] = true; - } else { - for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + this.defaultCharUnicode = iconv.defaultCharUnicode; + + // Encode tables: Unicode -> DBCS. + + // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. + // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. + // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). + // == UNASSIGNED -> no conversion found. Output a default char. + // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. + this.encodeTable = []; + + // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of + // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key + // means end of sequence (needed when one sequence is a strict subsequence of another). + // Objects are kept separately from encodeTable to increase performance. + this.encodeTableSeq = []; + + // Some chars can be decoded, but need not be encoded. + const skipEncodeChars = {}; + if (codecOptions.encodeSkipVals) + for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { + const val = codecOptions.encodeSkipVals[i]; + if (typeof val === "number") { + skipEncodeChars[val] = true; + } else { + for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + } } - } - // Use decode trie to recursively fill out encode tables. - this._fillEncodeTable(0, 0, skipEncodeChars); + // Use decode trie to recursively fill out encode tables. + this._fillEncodeTable(0, 0, skipEncodeChars); + + // Add more encoding pairs when needed. + if (codecOptions.encodeAdd) { + for (const uChar in codecOptions.encodeAdd) { + if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) + this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + } + } - // Add more encoding pairs when needed. - if (codecOptions.encodeAdd) { - for (const uChar in codecOptions.encodeAdd) { - if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) - this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = this.encodeTable[0]["?"]; + } + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = "?".charCodeAt(0); } } - this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = this.encodeTable[0]["?"]; + get decoder() { + return DBCSDecoder; } - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = "?".charCodeAt(0); + + get encoder() { + return DBCSEncoder; } -} -DBCSCodec.prototype.encoder = DBCSEncoder; -DBCSCodec.prototype.decoder = DBCSDecoder; - -// Decoder helpers -DBCSCodec.prototype._getDecodeTrieNode = function (addr) { - const bytes = []; - for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); - if (bytes.length === 0) bytes.push(0); - - let node = this.decodeTables[0]; - for (let i = bytes.length - 1; i > 0; i--) { - // Traverse nodes deeper into the trie. - const val = node[bytes[i]]; - - if (val === UNASSIGNED) { - // Create new node. - node[bytes[i]] = NODE_START - this.decodeTables.length; - this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); - } else if (val <= NODE_START) { - // Existing node. - node = this.decodeTables[NODE_START - val]; - } else { - const hexAddr = addr.toString(16); - throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + _getDecodeTrieNode(addr) { + const bytes = []; + for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); + if (bytes.length === 0) bytes.push(0); + + let node = this.decodeTables[0]; + for (let i = bytes.length - 1; i > 0; i--) { + // Traverse nodes deeper into the trie. + const val = node[bytes[i]]; + + if (val === UNASSIGNED) { + // Create new node. + node[bytes[i]] = NODE_START - this.decodeTables.length; + this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); + } else if (val <= NODE_START) { + // Existing node. + node = this.decodeTables[NODE_START - val]; + } else { + const hexAddr = addr.toString(16); + throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + } } + return node; } - return node; -}; -DBCSCodec.prototype._addDecodeChunk = function (chunk) { - // First element of chunk is the hex mbcs code where we start. - let curAddr = parseInt(chunk[0], 16); - - // Choose the decoding node where we'll write our chars. - const writeTable = this._getDecodeTrieNode(curAddr); - curAddr = curAddr & 0xff; - - // Write all other elements of the chunk to the table. - for (let k = 1; k < chunk.length; k++) { - const part = chunk[k]; - if (typeof part === "string") { - // String, write as-is. - for (let l = 0; l < part.length; ) { - const code = part.charCodeAt(l++); - if (0xd800 <= code && code < 0xdc00) { - // Decode surrogate - const codeTrail = part.charCodeAt(l++); - if (0xdc00 <= codeTrail && codeTrail < 0xe000) { - writeTable[curAddr++] = - 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + _addDecodeChunk(chunk) { + // First element of chunk is the hex mbcs code where we start. + let curAddr = parseInt(chunk[0], 16); + + // Choose the decoding node where we'll write our chars. + const writeTable = this._getDecodeTrieNode(curAddr); + curAddr = curAddr & 0xff; + + // Write all other elements of the chunk to the table. + for (let k = 1; k < chunk.length; k++) { + const part = chunk[k]; + if (typeof part === "string") { + // String, write as-is. + for (let l = 0; l < part.length; ) { + const code = part.charCodeAt(l++); + if (0xd800 <= code && code < 0xdc00) { + // Decode surrogate + const codeTrail = part.charCodeAt(l++); + if (0xdc00 <= codeTrail && codeTrail < 0xe000) { + writeTable[curAddr++] = + 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + } else { + throw new Error( + `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` + ); + } + } else if (0x0ff0 < code && code <= 0x0fff) { + // Character sequence (our own encoding used) + const len = 0xfff - code + 2; + const seq = []; + for (let m = 0; m < len; m++) { + // Simple variation: don't support surrogates or subsequences in seq. + seq.push(part.charCodeAt(l++)); + } + + writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; + this.decodeTableSeq.push(seq); } else { - throw new Error( - `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` - ); + writeTable[curAddr++] = code; // Basic char } - } else if (0x0ff0 < code && code <= 0x0fff) { - // Character sequence (our own encoding used) - const len = 0xfff - code + 2; - const seq = []; - for (let m = 0; m < len; m++) { - // Simple variation: don't support surrogates or subsequences in seq. - seq.push(part.charCodeAt(l++)); - } - - writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; - this.decodeTableSeq.push(seq); - } else { - writeTable[curAddr++] = code; // Basic char } - } - } else if (typeof part === "number") { - // Integer, meaning increasing sequence starting with prev character. - let charCode = writeTable[curAddr - 1] + 1; - for (let l = 0; l < part; l++) { - writeTable[curAddr++] = charCode++; - } - } else + } else if (typeof part === "number") { + // Integer, meaning increasing sequence starting with prev character. + let charCode = writeTable[curAddr - 1] + 1; + for (let l = 0; l < part; l++) { + writeTable[curAddr++] = charCode++; + } + } else + throw new Error( + `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${ + chunk[0] + }` + ); + } + if (curAddr > 0xff) throw new Error( - `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${chunk[0]}` + `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` ); } - if (curAddr > 0xff) - throw new Error( - `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` - ); -}; - -// Encoder helpers -DBCSCodec.prototype._getEncodeBucket = function (uCode) { - const high = uCode >> 8; // This could be > 0xFF because of astral characters. - if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. - return this.encodeTable[high]; -}; -DBCSCodec.prototype._setEncodeChar = function (uCode, dbcsCode) { - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - if (bucket[low] <= SEQ_START) { - // There's already a sequence, set a single-char subsequence of it. - this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; - } else if (bucket[low] === UNASSIGNED) { - bucket[low] = dbcsCode; + _getEncodeBucket(uCode) { + const high = uCode >> 8; // This could be > 0xFF because of astral characters. + if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. + return this.encodeTable[high]; } -}; -DBCSCodec.prototype._setEncodeSequence = function (seq, dbcsCode) { - // Get the root of character tree according to first character of the sequence. - const uCode = seq[0]; - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - - let node; - if (bucket[low] <= SEQ_START) { - // There's already a sequence with - use it. - node = this.encodeTableSeq[SEQ_START - bucket[low]]; - } else { - // There was no sequence object - allocate a new one. - node = {}; - if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. - bucket[low] = SEQ_START - this.encodeTableSeq.length; - this.encodeTableSeq.push(node); + _setEncodeChar(uCode, dbcsCode) { + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + if (bucket[low] <= SEQ_START) { + // There's already a sequence, set a single-char subsequence of it. + this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; + } else if (bucket[low] === UNASSIGNED) { + bucket[low] = dbcsCode; + } } - // Traverse the character tree, allocating new nodes as needed. - for (let j = 1; j < seq.length - 1; j++) { - const oldVal = node[uCode]; - if (typeof oldVal === "object") { - node = oldVal; + _setEncodeSequence(seq, dbcsCode) { + // Get the root of character tree according to first character of the sequence. + const uCode = seq[0]; + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + + let node; + if (bucket[low] <= SEQ_START) { + // There's already a sequence with - use it. + node = this.encodeTableSeq[SEQ_START - bucket[low]]; } else { - node = node[uCode] = {}; - if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + // There was no sequence object - allocate a new one. + node = {}; + if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. + bucket[low] = SEQ_START - this.encodeTableSeq.length; + this.encodeTableSeq.push(node); } - } - // Set the leaf to given dbcsCode. - const uCode2 = seq[seq.length - 1]; - node[uCode2] = dbcsCode; -}; + // Traverse the character tree, allocating new nodes as needed. + for (let j = 1; j < seq.length - 1; j++) { + const oldVal = node[uCode]; + if (typeof oldVal === "object") { + node = oldVal; + } else { + node = node[uCode] = {}; + if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + } + } -DBCSCodec.prototype._fillEncodeTable = function (nodeIdx, prefix, skipEncodeChars) { - const node = this.decodeTables[nodeIdx]; - let hasValues = false; - const subNodeEmpty = {}; - for (let i = 0; i < 0x100; i++) { - const uCode = node[i]; - const mbCode = prefix + i; - if (skipEncodeChars[mbCode]) continue; - - if (uCode >= 0) { - this._setEncodeChar(uCode, mbCode); - hasValues = true; - } else if (uCode <= NODE_START) { - const subNodeIdx = NODE_START - uCode; - if (!subNodeEmpty[subNodeIdx]) { - // Skip empty subtrees (they are too large in gb18030). - var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. - if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { - hasValues = true; - } else { - subNodeEmpty[subNodeIdx] = true; + // Set the leaf to given dbcsCode. + const uCode2 = seq[seq.length - 1]; + node[uCode2] = dbcsCode; + } + + _fillEncodeTable(nodeIdx, prefix, skipEncodeChars) { + const node = this.decodeTables[nodeIdx]; + let hasValues = false; + const subNodeEmpty = {}; + for (let i = 0; i < 0x100; i++) { + const uCode = node[i]; + const mbCode = prefix + i; + if (skipEncodeChars[mbCode]) continue; + + if (uCode >= 0) { + this._setEncodeChar(uCode, mbCode); + hasValues = true; + } else if (uCode <= NODE_START) { + const subNodeIdx = NODE_START - uCode; + if (!subNodeEmpty[subNodeIdx]) { + // Skip empty subtrees (they are too large in gb18030). + var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. + if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { + hasValues = true; + } else { + subNodeEmpty[subNodeIdx] = true; + } } + } else if (uCode <= SEQ_START) { + this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); + hasValues = true; } - } else if (uCode <= SEQ_START) { - this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); - hasValues = true; } + return hasValues; } - return hasValues; }; // == Encoder ================================================================== -function DBCSEncoder(options, codec) { - // Encoder state - this.leadSurrogate = -1; - this.seqObj = undefined; - - // Static data - this.encodeTable = codec.encodeTable; - this.encodeTableSeq = codec.encodeTableSeq; - this.defaultCharSingleByte = codec.defCharSB; - this.gb18030 = codec.gb18030; -} +class DBCSEncoder { + constructor(options, codec, backend) { + this.backend = backend; + // Encoder state + this.leadSurrogate = -1; + this.seqObj = undefined; -DBCSEncoder.prototype.write = function (str) { - const newBuf = Buffer.alloc(str.length * (this.gb18030 ? 4 : 3)); - let leadSurrogate = this.leadSurrogate, - seqObj = this.seqObj, - nextChar = -1, - i = 0, - j = 0; - - for (;;) { - // 0. Get next character. - let uCode; - if (nextChar === -1) { - if (i === str.length) break; - uCode = str.charCodeAt(i++); - } else { - uCode = nextChar; - nextChar = -1; - } + // Static data + this.encodeTable = codec.encodeTable; + this.encodeTableSeq = codec.encodeTableSeq; + this.defaultCharSingleByte = codec.defCharSB; + this.gb18030 = codec.gb18030; + } - // 1. Handle surrogates. - if (0xd800 <= uCode && uCode < 0xe000) { - // Char is one of surrogates. - if (uCode < 0xdc00) { - // We've got a lead surrogate. - if (leadSurrogate === -1) { - leadSurrogate = uCode; - continue; - } else { - leadSurrogate = uCode; - // Double lead surrogate found. - uCode = UNASSIGNED; - } + write(str) { + const bytes = this.backend.allocBytes(str.length * (this.gb18030 ? 4 : 3)); + let leadSurrogate = this.leadSurrogate, + seqObj = this.seqObj, + nextChar = -1, + i = 0, + bytePos = 0; + + for (;;) { + // 0. Get next character. + let uCode; + if (nextChar === -1) { + if (i === str.length) break; + uCode = str.charCodeAt(i++); } else { - // We've got trail surrogate. - if (leadSurrogate !== -1) { - uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); - leadSurrogate = -1; - } else { - // Incomplete surrogate pair - only trail surrogate found. - uCode = UNASSIGNED; - } + uCode = nextChar; + nextChar = -1; } - } else if (leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - nextChar = uCode; - uCode = UNASSIGNED; // Write an error, then current char. - leadSurrogate = -1; - } - // 2. Convert uCode character. - let dbcsCode = UNASSIGNED; - if (seqObj !== undefined && uCode !== UNASSIGNED) { - // We are in the middle of the sequence - let resCode = seqObj[uCode]; - if (typeof resCode === "object") { - // Sequence continues. - seqObj = resCode; - continue; - } else if (typeof resCode == "number") { - // Sequence finished. Write it. - dbcsCode = resCode; - } else if (resCode === undefined) { - // Current character is not part of the sequence. - - // Try default character for this sequence - resCode = seqObj[DEF_CHAR]; - if (resCode !== undefined) { - dbcsCode = resCode; // Found. Write it. - nextChar = uCode; // Current character will be written too in the next iteration. + // 1. Handle surrogates. + if (0xd800 <= uCode && uCode < 0xe000) { + // Char is one of surrogates. + if (uCode < 0xdc00) { + // We've got a lead surrogate. + if (leadSurrogate === -1) { + leadSurrogate = uCode; + continue; + } else { + leadSurrogate = uCode; + // Double lead surrogate found. + uCode = UNASSIGNED; + } } else { - // TODO: What if we have no default? (resCode == undefined) - // Then, we should write first char of the sequence as-is and try the rest recursively. - // Didn't do it for now because no encoding has this situation yet. - // Currently, just skip the sequence and write current char. + // We've got trail surrogate. + if (leadSurrogate !== -1) { + uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); + leadSurrogate = -1; + } else { + // Incomplete surrogate pair - only trail surrogate found. + uCode = UNASSIGNED; + } } - } - seqObj = undefined; - } else if (uCode >= 0) { - // Regular character - const subtable = this.encodeTable[uCode >> 8]; - if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; - - if (dbcsCode <= SEQ_START) { - // Sequence start - seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; - continue; + } else if (leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + nextChar = uCode; + uCode = UNASSIGNED; // Write an error, then current char. + leadSurrogate = -1; } - if (dbcsCode === UNASSIGNED && this.gb18030) { - // Use GB18030 algorithm to find character(s) to write. - const idx = findIdx(this.gb18030.uChars, uCode); - if (idx !== -1) { - dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); - dbcsCode = dbcsCode % 12600; - newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); - dbcsCode = dbcsCode % 1260; - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); - dbcsCode = dbcsCode % 10; - newBuf[j++] = 0x30 + dbcsCode; + // 2. Convert uCode character. + let dbcsCode = UNASSIGNED; + if (seqObj !== undefined && uCode !== UNASSIGNED) { + // We are in the middle of the sequence + let resCode = seqObj[uCode]; + if (typeof resCode === "object") { + // Sequence continues. + seqObj = resCode; continue; + } else if (typeof resCode == "number") { + // Sequence finished. Write it. + dbcsCode = resCode; + } else if (resCode === undefined) { + // Current character is not part of the sequence. + + // Try default character for this sequence + resCode = seqObj[DEF_CHAR]; + if (resCode !== undefined) { + dbcsCode = resCode; // Found. Write it. + nextChar = uCode; // Current character will be written too in the next iteration. + } else { + // TODO: What if we have no default? (resCode == undefined) + // Then, we should write first char of the sequence as-is and try the rest recursively. + // Didn't do it for now because no encoding has this situation yet. + // Currently, just skip the sequence and write current char. + } + } + seqObj = undefined; + } else if (uCode >= 0) { + // Regular character + const subtable = this.encodeTable[uCode >> 8]; + if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; + + if (dbcsCode <= SEQ_START) { + // Sequence start + seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; + continue; + } + + if (dbcsCode === UNASSIGNED && this.gb18030) { + // Use GB18030 algorithm to find character(s) to write. + const idx = findIdx(this.gb18030.uChars, uCode); + if (idx !== -1) { + dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 12600); + dbcsCode = dbcsCode % 12600; + bytes[bytePos++] = 0x30 + Math.floor(dbcsCode / 1260); + dbcsCode = dbcsCode % 1260; + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 10); + dbcsCode = dbcsCode % 10; + bytes[bytePos++] = 0x30 + dbcsCode; + continue; + } } } - } - // 3. Write dbcsCode character. - if (dbcsCode === UNASSIGNED) { - dbcsCode = this.defaultCharSingleByte; - } + // 3. Write dbcsCode character. + if (dbcsCode === UNASSIGNED) { + dbcsCode = this.defaultCharSingleByte; + } - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; - } else if (dbcsCode < 0x10000) { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte - } else if (dbcsCode < 0x1000000) { - newBuf[j++] = dbcsCode >> 16; - newBuf[j++] = (dbcsCode >> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; - } else { - newBuf[j++] = dbcsCode >>> 24; - newBuf[j++] = (dbcsCode >>> 16) & 0xff; - newBuf[j++] = (dbcsCode >>> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else if (dbcsCode < 0x10000) { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } else if (dbcsCode < 0x1000000) { + bytes[bytePos++] = dbcsCode >> 16; + bytes[bytePos++] = (dbcsCode >> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } else { + bytes[bytePos++] = dbcsCode >>> 24; + bytes[bytePos++] = (dbcsCode >>> 16) & 0xff; + bytes[bytePos++] = (dbcsCode >>> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } } - } - this.seqObj = seqObj; - this.leadSurrogate = leadSurrogate; - return newBuf.slice(0, j); -}; - -DBCSEncoder.prototype.end = function () { - if (this.leadSurrogate === -1 && this.seqObj === undefined) { - return undefined; // All clean. Most often case. + this.seqObj = seqObj; + this.leadSurrogate = leadSurrogate; + return this.backend.bytesToResult(bytes, bytePos); } - const newBuf = Buffer.alloc(10); - let j = 0; + end() { + if (this.leadSurrogate === -1 && this.seqObj === undefined) { + return undefined; // All clean. Most often case. + } - if (this.seqObj) { - // We're in the sequence. - const dbcsCode = this.seqObj[DEF_CHAR]; - if (dbcsCode !== undefined) { - // Write beginning of the sequence. - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; + const bytes = this.backend.allocBytes(10); + let bytePos = 0; + + if (this.seqObj) { + // We're in the sequence. + const dbcsCode = this.seqObj[DEF_CHAR]; + if (dbcsCode !== undefined) { + // Write beginning of the sequence. + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } } else { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte + // See todo above. } - } else { - // See todo above. + this.seqObj = undefined; } - this.seqObj = undefined; - } - if (this.leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - newBuf[j++] = this.defaultCharSingleByte; - this.leadSurrogate = -1; - } + if (this.leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + bytes[bytePos++] = this.defaultCharSingleByte; + this.leadSurrogate = -1; + } - return newBuf.slice(0, j); -}; + return this.backend.bytesToResult(bytes, bytePos); + } -// Export for testing -DBCSEncoder.prototype.findIdx = findIdx; + // Export for testing + findIdx(table, val) { + return findIdx(table, val); + } +} // == Decoder ================================================================== -function DBCSDecoder(options, codec) { - // Decoder state - this.nodeIdx = 0; - this.prevBytes = []; +class DBCSDecoder { + constructor(options, codec, backend) { + this.backend = backend; - // Static data - this.decodeTables = codec.decodeTables; - this.decodeTableSeq = codec.decodeTableSeq; - this.defaultCharUnicode = codec.defaultCharUnicode; - this.gb18030 = codec.gb18030; -} + // Decoder state + this.nodeIdx = 0; + this.prevBytes = []; -DBCSDecoder.prototype.write = function (buf) { - const newBuf = Buffer.alloc(buf.length * 2), - prevBytes = this.prevBytes, - prevOffset = this.prevBytes.length; - - let nodeIdx = this.nodeIdx, - seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. - j = 0; - - for (let i = 0; i < buf.length; i++) { - const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; - - // TODO: Check curByte is number 0 <= < 256 - - // Lookup in current trie node. - let uCode = this.decodeTables[nodeIdx][curByte]; - - if (uCode >= 0) { - // Normal character, just use it. - } else if (uCode === UNASSIGNED) { - // Unknown char. - // TODO: Callback with seq. - uCode = this.defaultCharUnicode.charCodeAt(0); - i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. - } else if (uCode === GB18030_CODE) { - const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; - const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; - const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; - const ptr = - (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); - const idx = findIdx(this.gb18030.gbChars, ptr); - uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; - } else if (uCode <= NODE_START) { - // Go to next trie node. - nodeIdx = NODE_START - uCode; - continue; - } else if (uCode <= SEQ_START) { - // Output a sequence of chars. - const seq = this.decodeTableSeq[SEQ_START - uCode]; - for (let k = 0; k < seq.length - 1; k++) { - uCode = seq[k]; - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - } - uCode = seq[seq.length - 1]; - } else - throw new Error( - `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` - ); + // Static data + this.decodeTables = codec.decodeTables; + this.decodeTableSeq = codec.decodeTableSeq; + this.defaultCharUnicode = codec.defaultCharUnicode; + this.gb18030 = codec.gb18030; + } - // Write the character to buffer, handling higher planes using surrogate pair. - if (uCode >= 0x10000) { - uCode -= 0x10000; - const uCodeLead = 0xd800 | (uCode >> 10); - newBuf[j++] = uCodeLead & 0xff; - newBuf[j++] = uCodeLead >> 8; + write(buf) { + const chars = this.backend.allocRawChars(buf.length), + prevBytes = this.prevBytes, + prevOffset = this.prevBytes.length; + + let nodeIdx = this.nodeIdx, + seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. + charPos = 0; + + for (let i = 0; i < buf.length; i++) { + const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; + + // TODO: Check curByte is number 0 <= < 256 + + // Lookup in current trie node. + let uCode = this.decodeTables[nodeIdx][curByte]; + + if (uCode >= 0) { + // Normal character, just use it. + } else if (uCode === UNASSIGNED) { + // Unknown char. + // TODO: Callback with seq. + uCode = this.defaultCharUnicode.charCodeAt(0); + i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. + } else if (uCode === GB18030_CODE) { + const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; + const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; + const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; + const ptr = + (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); + const idx = findIdx(this.gb18030.gbChars, ptr); + uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; + } else if (uCode <= NODE_START) { + // Go to next trie node. + nodeIdx = NODE_START - uCode; + continue; + } else if (uCode <= SEQ_START) { + // Output a sequence of chars. + const seq = this.decodeTableSeq[SEQ_START - uCode]; + for (let k = 0; k < seq.length - 1; k++) { + uCode = seq[k]; + chars[charPos++] = uCode; + } + uCode = seq[seq.length - 1]; + } else + throw new Error( + `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` + ); + + // Write the character to buffer, handling higher planes using surrogate pair. + if (uCode >= 0x10000) { + uCode -= 0x10000; + const uCodeLead = 0xd800 | (uCode >> 10); + chars[charPos++] = uCodeLead; + + uCode = 0xdc00 | (uCode & 0x3ff); + } + chars[charPos++] = uCode; - uCode = 0xdc00 | (uCode & 0x3ff); + // Reset trie node. + nodeIdx = 0; + seqStart = i + 1; } - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - // Reset trie node. - nodeIdx = 0; - seqStart = i + 1; - } + this.nodeIdx = nodeIdx; + this.prevBytes = + seqStart >= 0 + ? Array.prototype.slice.call(buf, seqStart) + : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); - this.nodeIdx = nodeIdx; - this.prevBytes = - seqStart >= 0 - ? Array.prototype.slice.call(buf, seqStart) - : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); + return this.backend.rawCharsToResult(chars, charPos); + } - return newBuf.slice(0, j).toString("ucs2"); -}; + end() { + let ret = ""; -DBCSDecoder.prototype.end = function () { - let ret = ""; + // Try to parse all remaining chars. + while (this.prevBytes.length > 0) { + // Skip 1 character in the buffer. + ret += this.defaultCharUnicode; + const bytesArr = this.prevBytes.slice(1); - // Try to parse all remaining chars. - while (this.prevBytes.length > 0) { - // Skip 1 character in the buffer. - ret += this.defaultCharUnicode; - const bytesArr = this.prevBytes.slice(1); + // Parse remaining as usual. + this.prevBytes = []; + this.nodeIdx = 0; + if (bytesArr.length > 0) ret += this.write(bytesArr); + } - // Parse remaining as usual. this.prevBytes = []; this.nodeIdx = 0; - if (bytesArr.length > 0) ret += this.write(bytesArr); - } - this.prevBytes = []; - this.nodeIdx = 0; - return ret; -}; + return ret; + } +} // Binary search for GB18030. Returns largest i such that table[i] <= val. function findIdx(table, val) { diff --git a/test/gbkFile.txt b/generation/fixtures/gbkFile.txt similarity index 100% rename from test/gbkFile.txt rename to generation/fixtures/gbkFile.txt diff --git a/generation/gen-gbk-big5-fixtures.js b/generation/gen-gbk-big5-fixtures.js new file mode 100644 index 0000000..595c262 --- /dev/null +++ b/generation/gen-gbk-big5-fixtures.js @@ -0,0 +1,41 @@ +"use strict"; + +const Iconv = require("iconv").Iconv, + fs = require("fs"), + path = require("path"), + utils = require("../test/utils"); + +const fixtures = { + big5: big5(), + gbk: gbk(), +}; +const outputFile = path.resolve(__dirname, "..", "test", "fixtures", "gbk-big5.json"); +fs.writeFileSync(outputFile, JSON.stringify(fixtures)); + +function gbk() { + const inputFile = path.resolve(__dirname, "fixtures", "gbkFile.txt"); + const contentBuffer = fs.readFileSync(inputFile); + + const codec = Iconv("GBK", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} + +function big5() { + const contentBuffer = Buffer.from( + "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", + "base64" + ); + + const codec = Iconv("big5", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} diff --git a/test/big5-test.js b/test/big5-test.js index c7a7a38..4c492f3 100644 --- a/test/big5-test.js +++ b/test/big5-test.js @@ -1,71 +1,68 @@ "use strict"; -var assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + fixtures = require("./fixtures/gbk-big5.json"), + iconv = utils.requireIconv(); -var testString = "中文abc", //unicode contains Big5-code and ascii - testStringBig5Buffer = Buffer.from([0xa4, 0xa4, 0xa4, 0xe5, 0x61, 0x62, 0x63]), +const testString = "中文abc", //unicode contains Big5-code and ascii + testStringBig5Buffer = utils.bytes("a4 a4 a4 e5 61 62 63"), testString2 = "測試", - testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]); + testStringBig5Buffer2 = utils.bytes("b4 fa b8 d5"); -describe("Big5 tests", function () { +describe("Big5 tests #node-web", function () { it("Big5 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "big5").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "big5")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString); assert.strictEqual( - iconv.encode(testString2, "big5").toString("hex"), - testStringBig5Buffer2.toString("hex") + utils.hex(iconv.encode(testString2, "big5")), + utils.hex(testStringBig5Buffer2) ); assert.strictEqual(iconv.decode(testStringBig5Buffer2, "big5"), testString2); }); it("cp950 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "cp950").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "cp950")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString); }); it("Big5 file read decoded,compare with iconv result", function () { - var contentBuffer = Buffer.from( - "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", - "base64" - ); - var str = iconv.decode(contentBuffer, "big5"); - var iconvc = new (require("iconv").Iconv)("big5", "utf8"); - assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); + const contentBuffer = utils.bytes(fixtures.big5.bytes); + const str = iconv.decode(contentBuffer, "big5"); + assert.strictEqual(fixtures.big5.string, str); }); it("Big5 correctly decodes and encodes characters · and ×", function () { // https://github.com/ashtuchkin/iconv-lite/issues/13 // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT - var chars = "·×"; - var big5Chars = Buffer.from([0xa1, 0x50, 0xa1, 0xd1]); - assert.strictEqual(iconv.encode(chars, "big5").toString("hex"), big5Chars.toString("hex")); + const chars = "·×"; + const big5Chars = utils.bytes("a1 50 a1 d1"); + assert.strictEqual(utils.hex(iconv.encode(chars, "big5")), utils.hex(big5Chars)); assert.strictEqual(iconv.decode(big5Chars, "big5"), chars); }); it("Big5 correctly encodes & decodes sequences", function () { - assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString("hex"), "8862"); - assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString("hex"), "88a5"); - assert.strictEqual(iconv.encode("\u00CA", "big5").toString("hex"), "8866"); - assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString("hex"), "88668866"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u0304", "big5")), "88 62"); + assert.strictEqual(utils.hex(iconv.encode("\u00EA\u030C", "big5")), "88 a5"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA", "big5")), "88 66"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u00CA", "big5")), "88 66 88 66"); - assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString("hex"), "88663f"); // Unfinished surrogate. - assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD800", "big5")), "88 66 3f"); // Unfinished surrogate. + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD841\uDD47", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA𠕇", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.decode(Buffer.from("8862", "hex"), "big5"), "\u00CA\u0304"); - assert.strictEqual(iconv.decode(Buffer.from("8866", "hex"), "big5"), "\u00CA"); - assert.strictEqual(iconv.decode(Buffer.from("8866fa40", "hex"), "big5"), "\u00CA𠕇"); + assert.strictEqual(iconv.decode(utils.bytes("88 62"), "big5"), "\u00CA\u0304"); + assert.strictEqual(iconv.decode(utils.bytes("88 66"), "big5"), "\u00CA"); + assert.strictEqual(iconv.decode(utils.bytes("88 66 fa 40"), "big5"), "\u00CA𠕇"); }); it("Big5 correctly encodes 十", function () { - assert.strictEqual(iconv.encode("十", "big5").toString("hex"), "a451"); + assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51"); }); }); diff --git a/test/fixtures/gbk-big5.json b/test/fixtures/gbk-big5.json new file mode 100644 index 0000000..2a00670 --- /dev/null +++ b/test/fixtures/gbk-big5.json @@ -0,0 +1,10 @@ +{ + "big5": { + "bytes": "3c 48 54 4d 4c 3e 0d 0a 3c 48 45 41 44 3e 20 20 20 20 0d 0a 09 3c 54 49 54 4c 45 3e 20 6d 65 74 61 20 bc d0 c5 d2 aa ba a8 cf a5 ce a1 47 a4 a4 a4 e5 ba f4 ad b6 20 3c 2f 54 49 54 4c 45 3e 0d 0a 09 3c 6d 65 74 61 20 48 54 54 50 2d 45 51 55 49 56 3d 22 43 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 43 4f 4e 54 45 4e 54 3d 22 74 65 78 74 2f 68 74 6d 6c 3b 20 63 68 61 72 73 65 74 3d 62 69 67 35 22 3e 0d 0a 3c 2f 48 45 41 44 3e 0d 0a 3c 42 4f 44 59 3e 0d 0a 0d 0a b3 6f ac 4f a4 40 ad d3 c1 63 c5 e9 a4 a4 a4 e5 ba f4 ad b6 a1 49 3c 62 72 3e 0d 0a 28 54 68 69 73 20 70 61 67 65 20 75 73 65 73 20 62 69 67 35 20 63 68 61 72 61 63 74 65 72 20 73 65 74 2e 29 3c 62 72 3e 0d 0a 63 68 61 72 73 65 74 3d 62 69 67 35 0d 0a 0d 0a 3c 2f 42 4f 44 59 3e 0d 0a 3c 2f 48 54 4d 4c 3e", + "string": "\r\n
\r\n\t