From 5d99a923f2bb9352abf80f8aeb850d924a8a1e38 Mon Sep 17 00:00:00 2001 From: Fedor Nezhivoi Date: Sun, 26 Jul 2020 18:12:43 +1000 Subject: [PATCH] Convert dbcs codec and tests (#256) --- encodings/dbcs-codec.js | 1014 +++++++++++---------- {test => generation/fixtures}/gbkFile.txt | 0 generation/gen-gbk-big5-fixtures.js | 41 + test/big5-test.js | 65 +- test/fixtures/gbk-big5.json | 10 + test/gbk-test.js | 130 ++- test/shiftjis-test.js | 38 +- test/webpack/iconv-lite-tests.js | 3 + 8 files changed, 673 insertions(+), 628 deletions(-) rename {test => generation/fixtures}/gbkFile.txt (100%) create mode 100644 generation/gen-gbk-big5-fixtures.js create mode 100644 test/fixtures/gbk-big5.json diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 3294ed9..456c8ac 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -1,12 +1,9 @@ "use strict"; -var Buffer = require("safer-buffer").Buffer; // Multibyte codec. In this scheme, a character is represented by 1 or more bytes. // Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. // To save memory and loading time, we read table files only when requested. -exports._dbcs = DBCSCodec; - const UNASSIGNED = -1, GB18030_CODE = -2, SEQ_START = -10, @@ -21,588 +18,603 @@ for (let i = 0; i < 0x100; i++) { } // Class DBCSCodec reads and initializes mapping tables. -function DBCSCodec(codecOptions, iconv) { - this.encodingName = codecOptions.encodingName; - if (!codecOptions) throw new Error("DBCS codec is called without the data."); - if (!codecOptions.table) throw new Error("Encoding '" + this.encodingName + "' has no data."); - - // Load tables. - const mappingTable = codecOptions.table(); - - // Decode tables: MBCS -> Unicode. - - // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. - // Trie root is decodeTables[0]. - // Values: >= 0 -> unicode character code. can be > 0xFFFF - // == UNASSIGNED -> unknown/unassigned sequence. - // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. - // <= NODE_START -> index of the next node in our trie to process next byte. - // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. - this.decodeTables = []; - this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. - - // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. - this.decodeTableSeq = []; - - // Actual mapping tables consist of chunks. Use them to fill up decode tables. - for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); - - // Load & create GB18030 tables when needed. - if (typeof codecOptions.gb18030 === "function") { - this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. - - // Add GB18030 common decode nodes. - const commonThirdByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - const commonFourthByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - // Fill out the tree - const firstByteNode = this.decodeTables[0]; - for (let i = 0x81; i <= 0xfe; i++) { - const secondNodeIdx = NODE_START - firstByteNode[i]; - const secondByteNode = this.decodeTables[secondNodeIdx]; - for (let j = 0x30; j <= 0x39; j++) { - if (secondByteNode[j] === UNASSIGNED) { - secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; - } else if (secondByteNode[j] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 2"); - } - - const thirdNodeIdx = NODE_START - secondByteNode[j]; - const thirdByteNode = this.decodeTables[thirdNodeIdx]; - for (let k = 0x81; k <= 0xfe; k++) { - const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; - if (thirdByteNode[k] === UNASSIGNED) { - thirdByteNode[k] = commonFourthNodeIdx; - } else if (thirdByteNode[k] === commonFourthNodeIdx) { - continue; - } else if (thirdByteNode[k] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 3"); +exports._dbcs = class DBCSCodec { + constructor(codecOptions, iconv) { + this.encodingName = codecOptions.encodingName; + if (!codecOptions) throw new Error("DBCS codec is called without the data."); + if (!codecOptions.table) + throw new Error("Encoding '" + this.encodingName + "' has no data."); + + // Load tables. + const mappingTable = codecOptions.table(); + + // Decode tables: MBCS -> Unicode. + + // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. + // Trie root is decodeTables[0]. + // Values: >= 0 -> unicode character code. can be > 0xFFFF + // == UNASSIGNED -> unknown/unassigned sequence. + // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. + // <= NODE_START -> index of the next node in our trie to process next byte. + // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. + this.decodeTables = []; + this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. + + // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. + this.decodeTableSeq = []; + + // Actual mapping tables consist of chunks. Use them to fill up decode tables. + for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); + + // Load & create GB18030 tables when needed. + if (typeof codecOptions.gb18030 === "function") { + this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. + + // Add GB18030 common decode nodes. + const commonThirdByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + const commonFourthByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + // Fill out the tree + const firstByteNode = this.decodeTables[0]; + for (let i = 0x81; i <= 0xfe; i++) { + const secondNodeIdx = NODE_START - firstByteNode[i]; + const secondByteNode = this.decodeTables[secondNodeIdx]; + for (let j = 0x30; j <= 0x39; j++) { + if (secondByteNode[j] === UNASSIGNED) { + secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; + } else if (secondByteNode[j] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 2"); } - const fourthNodeIdx = NODE_START - thirdByteNode[k]; - const fourthByteNode = this.decodeTables[fourthNodeIdx]; - for (let l = 0x30; l <= 0x39; l++) { - if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + const thirdNodeIdx = NODE_START - secondByteNode[j]; + const thirdByteNode = this.decodeTables[thirdNodeIdx]; + for (let k = 0x81; k <= 0xfe; k++) { + const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; + if (thirdByteNode[k] === UNASSIGNED) { + thirdByteNode[k] = commonFourthNodeIdx; + } else if (thirdByteNode[k] === commonFourthNodeIdx) { + continue; + } else if (thirdByteNode[k] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 3"); + } + + const fourthNodeIdx = NODE_START - thirdByteNode[k]; + const fourthByteNode = this.decodeTables[fourthNodeIdx]; + for (let l = 0x30; l <= 0x39; l++) { + if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + } } } } } - } - this.defaultCharUnicode = iconv.defaultCharUnicode; - - // Encode tables: Unicode -> DBCS. - - // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. - // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. - // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). - // == UNASSIGNED -> no conversion found. Output a default char. - // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. - this.encodeTable = []; - - // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of - // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key - // means end of sequence (needed when one sequence is a strict subsequence of another). - // Objects are kept separately from encodeTable to increase performance. - this.encodeTableSeq = []; - - // Some chars can be decoded, but need not be encoded. - const skipEncodeChars = {}; - if (codecOptions.encodeSkipVals) - for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { - const val = codecOptions.encodeSkipVals[i]; - if (typeof val === "number") { - skipEncodeChars[val] = true; - } else { - for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + this.defaultCharUnicode = iconv.defaultCharUnicode; + + // Encode tables: Unicode -> DBCS. + + // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. + // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. + // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). + // == UNASSIGNED -> no conversion found. Output a default char. + // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. + this.encodeTable = []; + + // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of + // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key + // means end of sequence (needed when one sequence is a strict subsequence of another). + // Objects are kept separately from encodeTable to increase performance. + this.encodeTableSeq = []; + + // Some chars can be decoded, but need not be encoded. + const skipEncodeChars = {}; + if (codecOptions.encodeSkipVals) + for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { + const val = codecOptions.encodeSkipVals[i]; + if (typeof val === "number") { + skipEncodeChars[val] = true; + } else { + for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + } } - } - // Use decode trie to recursively fill out encode tables. - this._fillEncodeTable(0, 0, skipEncodeChars); + // Use decode trie to recursively fill out encode tables. + this._fillEncodeTable(0, 0, skipEncodeChars); + + // Add more encoding pairs when needed. + if (codecOptions.encodeAdd) { + for (const uChar in codecOptions.encodeAdd) { + if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) + this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + } + } - // Add more encoding pairs when needed. - if (codecOptions.encodeAdd) { - for (const uChar in codecOptions.encodeAdd) { - if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) - this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = this.encodeTable[0]["?"]; + } + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = "?".charCodeAt(0); } } - this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = this.encodeTable[0]["?"]; + get decoder() { + return DBCSDecoder; } - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = "?".charCodeAt(0); + + get encoder() { + return DBCSEncoder; } -} -DBCSCodec.prototype.encoder = DBCSEncoder; -DBCSCodec.prototype.decoder = DBCSDecoder; - -// Decoder helpers -DBCSCodec.prototype._getDecodeTrieNode = function (addr) { - const bytes = []; - for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); - if (bytes.length === 0) bytes.push(0); - - let node = this.decodeTables[0]; - for (let i = bytes.length - 1; i > 0; i--) { - // Traverse nodes deeper into the trie. - const val = node[bytes[i]]; - - if (val === UNASSIGNED) { - // Create new node. - node[bytes[i]] = NODE_START - this.decodeTables.length; - this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); - } else if (val <= NODE_START) { - // Existing node. - node = this.decodeTables[NODE_START - val]; - } else { - const hexAddr = addr.toString(16); - throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + _getDecodeTrieNode(addr) { + const bytes = []; + for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); + if (bytes.length === 0) bytes.push(0); + + let node = this.decodeTables[0]; + for (let i = bytes.length - 1; i > 0; i--) { + // Traverse nodes deeper into the trie. + const val = node[bytes[i]]; + + if (val === UNASSIGNED) { + // Create new node. + node[bytes[i]] = NODE_START - this.decodeTables.length; + this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); + } else if (val <= NODE_START) { + // Existing node. + node = this.decodeTables[NODE_START - val]; + } else { + const hexAddr = addr.toString(16); + throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + } } + return node; } - return node; -}; -DBCSCodec.prototype._addDecodeChunk = function (chunk) { - // First element of chunk is the hex mbcs code where we start. - let curAddr = parseInt(chunk[0], 16); - - // Choose the decoding node where we'll write our chars. - const writeTable = this._getDecodeTrieNode(curAddr); - curAddr = curAddr & 0xff; - - // Write all other elements of the chunk to the table. - for (let k = 1; k < chunk.length; k++) { - const part = chunk[k]; - if (typeof part === "string") { - // String, write as-is. - for (let l = 0; l < part.length; ) { - const code = part.charCodeAt(l++); - if (0xd800 <= code && code < 0xdc00) { - // Decode surrogate - const codeTrail = part.charCodeAt(l++); - if (0xdc00 <= codeTrail && codeTrail < 0xe000) { - writeTable[curAddr++] = - 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + _addDecodeChunk(chunk) { + // First element of chunk is the hex mbcs code where we start. + let curAddr = parseInt(chunk[0], 16); + + // Choose the decoding node where we'll write our chars. + const writeTable = this._getDecodeTrieNode(curAddr); + curAddr = curAddr & 0xff; + + // Write all other elements of the chunk to the table. + for (let k = 1; k < chunk.length; k++) { + const part = chunk[k]; + if (typeof part === "string") { + // String, write as-is. + for (let l = 0; l < part.length; ) { + const code = part.charCodeAt(l++); + if (0xd800 <= code && code < 0xdc00) { + // Decode surrogate + const codeTrail = part.charCodeAt(l++); + if (0xdc00 <= codeTrail && codeTrail < 0xe000) { + writeTable[curAddr++] = + 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + } else { + throw new Error( + `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` + ); + } + } else if (0x0ff0 < code && code <= 0x0fff) { + // Character sequence (our own encoding used) + const len = 0xfff - code + 2; + const seq = []; + for (let m = 0; m < len; m++) { + // Simple variation: don't support surrogates or subsequences in seq. + seq.push(part.charCodeAt(l++)); + } + + writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; + this.decodeTableSeq.push(seq); } else { - throw new Error( - `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` - ); + writeTable[curAddr++] = code; // Basic char } - } else if (0x0ff0 < code && code <= 0x0fff) { - // Character sequence (our own encoding used) - const len = 0xfff - code + 2; - const seq = []; - for (let m = 0; m < len; m++) { - // Simple variation: don't support surrogates or subsequences in seq. - seq.push(part.charCodeAt(l++)); - } - - writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; - this.decodeTableSeq.push(seq); - } else { - writeTable[curAddr++] = code; // Basic char } - } - } else if (typeof part === "number") { - // Integer, meaning increasing sequence starting with prev character. - let charCode = writeTable[curAddr - 1] + 1; - for (let l = 0; l < part; l++) { - writeTable[curAddr++] = charCode++; - } - } else + } else if (typeof part === "number") { + // Integer, meaning increasing sequence starting with prev character. + let charCode = writeTable[curAddr - 1] + 1; + for (let l = 0; l < part; l++) { + writeTable[curAddr++] = charCode++; + } + } else + throw new Error( + `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${ + chunk[0] + }` + ); + } + if (curAddr > 0xff) throw new Error( - `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${chunk[0]}` + `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` ); } - if (curAddr > 0xff) - throw new Error( - `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` - ); -}; - -// Encoder helpers -DBCSCodec.prototype._getEncodeBucket = function (uCode) { - const high = uCode >> 8; // This could be > 0xFF because of astral characters. - if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. - return this.encodeTable[high]; -}; -DBCSCodec.prototype._setEncodeChar = function (uCode, dbcsCode) { - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - if (bucket[low] <= SEQ_START) { - // There's already a sequence, set a single-char subsequence of it. - this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; - } else if (bucket[low] === UNASSIGNED) { - bucket[low] = dbcsCode; + _getEncodeBucket(uCode) { + const high = uCode >> 8; // This could be > 0xFF because of astral characters. + if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. + return this.encodeTable[high]; } -}; -DBCSCodec.prototype._setEncodeSequence = function (seq, dbcsCode) { - // Get the root of character tree according to first character of the sequence. - const uCode = seq[0]; - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - - let node; - if (bucket[low] <= SEQ_START) { - // There's already a sequence with - use it. - node = this.encodeTableSeq[SEQ_START - bucket[low]]; - } else { - // There was no sequence object - allocate a new one. - node = {}; - if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. - bucket[low] = SEQ_START - this.encodeTableSeq.length; - this.encodeTableSeq.push(node); + _setEncodeChar(uCode, dbcsCode) { + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + if (bucket[low] <= SEQ_START) { + // There's already a sequence, set a single-char subsequence of it. + this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; + } else if (bucket[low] === UNASSIGNED) { + bucket[low] = dbcsCode; + } } - // Traverse the character tree, allocating new nodes as needed. - for (let j = 1; j < seq.length - 1; j++) { - const oldVal = node[uCode]; - if (typeof oldVal === "object") { - node = oldVal; + _setEncodeSequence(seq, dbcsCode) { + // Get the root of character tree according to first character of the sequence. + const uCode = seq[0]; + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + + let node; + if (bucket[low] <= SEQ_START) { + // There's already a sequence with - use it. + node = this.encodeTableSeq[SEQ_START - bucket[low]]; } else { - node = node[uCode] = {}; - if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + // There was no sequence object - allocate a new one. + node = {}; + if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. + bucket[low] = SEQ_START - this.encodeTableSeq.length; + this.encodeTableSeq.push(node); } - } - // Set the leaf to given dbcsCode. - const uCode2 = seq[seq.length - 1]; - node[uCode2] = dbcsCode; -}; + // Traverse the character tree, allocating new nodes as needed. + for (let j = 1; j < seq.length - 1; j++) { + const oldVal = node[uCode]; + if (typeof oldVal === "object") { + node = oldVal; + } else { + node = node[uCode] = {}; + if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + } + } -DBCSCodec.prototype._fillEncodeTable = function (nodeIdx, prefix, skipEncodeChars) { - const node = this.decodeTables[nodeIdx]; - let hasValues = false; - const subNodeEmpty = {}; - for (let i = 0; i < 0x100; i++) { - const uCode = node[i]; - const mbCode = prefix + i; - if (skipEncodeChars[mbCode]) continue; - - if (uCode >= 0) { - this._setEncodeChar(uCode, mbCode); - hasValues = true; - } else if (uCode <= NODE_START) { - const subNodeIdx = NODE_START - uCode; - if (!subNodeEmpty[subNodeIdx]) { - // Skip empty subtrees (they are too large in gb18030). - var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. - if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { - hasValues = true; - } else { - subNodeEmpty[subNodeIdx] = true; + // Set the leaf to given dbcsCode. + const uCode2 = seq[seq.length - 1]; + node[uCode2] = dbcsCode; + } + + _fillEncodeTable(nodeIdx, prefix, skipEncodeChars) { + const node = this.decodeTables[nodeIdx]; + let hasValues = false; + const subNodeEmpty = {}; + for (let i = 0; i < 0x100; i++) { + const uCode = node[i]; + const mbCode = prefix + i; + if (skipEncodeChars[mbCode]) continue; + + if (uCode >= 0) { + this._setEncodeChar(uCode, mbCode); + hasValues = true; + } else if (uCode <= NODE_START) { + const subNodeIdx = NODE_START - uCode; + if (!subNodeEmpty[subNodeIdx]) { + // Skip empty subtrees (they are too large in gb18030). + var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. + if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { + hasValues = true; + } else { + subNodeEmpty[subNodeIdx] = true; + } } + } else if (uCode <= SEQ_START) { + this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); + hasValues = true; } - } else if (uCode <= SEQ_START) { - this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); - hasValues = true; } + return hasValues; } - return hasValues; }; // == Encoder ================================================================== -function DBCSEncoder(options, codec) { - // Encoder state - this.leadSurrogate = -1; - this.seqObj = undefined; - - // Static data - this.encodeTable = codec.encodeTable; - this.encodeTableSeq = codec.encodeTableSeq; - this.defaultCharSingleByte = codec.defCharSB; - this.gb18030 = codec.gb18030; -} +class DBCSEncoder { + constructor(options, codec, backend) { + this.backend = backend; + // Encoder state + this.leadSurrogate = -1; + this.seqObj = undefined; -DBCSEncoder.prototype.write = function (str) { - const newBuf = Buffer.alloc(str.length * (this.gb18030 ? 4 : 3)); - let leadSurrogate = this.leadSurrogate, - seqObj = this.seqObj, - nextChar = -1, - i = 0, - j = 0; - - for (;;) { - // 0. Get next character. - let uCode; - if (nextChar === -1) { - if (i === str.length) break; - uCode = str.charCodeAt(i++); - } else { - uCode = nextChar; - nextChar = -1; - } + // Static data + this.encodeTable = codec.encodeTable; + this.encodeTableSeq = codec.encodeTableSeq; + this.defaultCharSingleByte = codec.defCharSB; + this.gb18030 = codec.gb18030; + } - // 1. Handle surrogates. - if (0xd800 <= uCode && uCode < 0xe000) { - // Char is one of surrogates. - if (uCode < 0xdc00) { - // We've got a lead surrogate. - if (leadSurrogate === -1) { - leadSurrogate = uCode; - continue; - } else { - leadSurrogate = uCode; - // Double lead surrogate found. - uCode = UNASSIGNED; - } + write(str) { + const bytes = this.backend.allocBytes(str.length * (this.gb18030 ? 4 : 3)); + let leadSurrogate = this.leadSurrogate, + seqObj = this.seqObj, + nextChar = -1, + i = 0, + bytePos = 0; + + for (;;) { + // 0. Get next character. + let uCode; + if (nextChar === -1) { + if (i === str.length) break; + uCode = str.charCodeAt(i++); } else { - // We've got trail surrogate. - if (leadSurrogate !== -1) { - uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); - leadSurrogate = -1; - } else { - // Incomplete surrogate pair - only trail surrogate found. - uCode = UNASSIGNED; - } + uCode = nextChar; + nextChar = -1; } - } else if (leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - nextChar = uCode; - uCode = UNASSIGNED; // Write an error, then current char. - leadSurrogate = -1; - } - // 2. Convert uCode character. - let dbcsCode = UNASSIGNED; - if (seqObj !== undefined && uCode !== UNASSIGNED) { - // We are in the middle of the sequence - let resCode = seqObj[uCode]; - if (typeof resCode === "object") { - // Sequence continues. - seqObj = resCode; - continue; - } else if (typeof resCode == "number") { - // Sequence finished. Write it. - dbcsCode = resCode; - } else if (resCode === undefined) { - // Current character is not part of the sequence. - - // Try default character for this sequence - resCode = seqObj[DEF_CHAR]; - if (resCode !== undefined) { - dbcsCode = resCode; // Found. Write it. - nextChar = uCode; // Current character will be written too in the next iteration. + // 1. Handle surrogates. + if (0xd800 <= uCode && uCode < 0xe000) { + // Char is one of surrogates. + if (uCode < 0xdc00) { + // We've got a lead surrogate. + if (leadSurrogate === -1) { + leadSurrogate = uCode; + continue; + } else { + leadSurrogate = uCode; + // Double lead surrogate found. + uCode = UNASSIGNED; + } } else { - // TODO: What if we have no default? (resCode == undefined) - // Then, we should write first char of the sequence as-is and try the rest recursively. - // Didn't do it for now because no encoding has this situation yet. - // Currently, just skip the sequence and write current char. + // We've got trail surrogate. + if (leadSurrogate !== -1) { + uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); + leadSurrogate = -1; + } else { + // Incomplete surrogate pair - only trail surrogate found. + uCode = UNASSIGNED; + } } - } - seqObj = undefined; - } else if (uCode >= 0) { - // Regular character - const subtable = this.encodeTable[uCode >> 8]; - if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; - - if (dbcsCode <= SEQ_START) { - // Sequence start - seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; - continue; + } else if (leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + nextChar = uCode; + uCode = UNASSIGNED; // Write an error, then current char. + leadSurrogate = -1; } - if (dbcsCode === UNASSIGNED && this.gb18030) { - // Use GB18030 algorithm to find character(s) to write. - const idx = findIdx(this.gb18030.uChars, uCode); - if (idx !== -1) { - dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); - dbcsCode = dbcsCode % 12600; - newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); - dbcsCode = dbcsCode % 1260; - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); - dbcsCode = dbcsCode % 10; - newBuf[j++] = 0x30 + dbcsCode; + // 2. Convert uCode character. + let dbcsCode = UNASSIGNED; + if (seqObj !== undefined && uCode !== UNASSIGNED) { + // We are in the middle of the sequence + let resCode = seqObj[uCode]; + if (typeof resCode === "object") { + // Sequence continues. + seqObj = resCode; continue; + } else if (typeof resCode == "number") { + // Sequence finished. Write it. + dbcsCode = resCode; + } else if (resCode === undefined) { + // Current character is not part of the sequence. + + // Try default character for this sequence + resCode = seqObj[DEF_CHAR]; + if (resCode !== undefined) { + dbcsCode = resCode; // Found. Write it. + nextChar = uCode; // Current character will be written too in the next iteration. + } else { + // TODO: What if we have no default? (resCode == undefined) + // Then, we should write first char of the sequence as-is and try the rest recursively. + // Didn't do it for now because no encoding has this situation yet. + // Currently, just skip the sequence and write current char. + } + } + seqObj = undefined; + } else if (uCode >= 0) { + // Regular character + const subtable = this.encodeTable[uCode >> 8]; + if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; + + if (dbcsCode <= SEQ_START) { + // Sequence start + seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; + continue; + } + + if (dbcsCode === UNASSIGNED && this.gb18030) { + // Use GB18030 algorithm to find character(s) to write. + const idx = findIdx(this.gb18030.uChars, uCode); + if (idx !== -1) { + dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 12600); + dbcsCode = dbcsCode % 12600; + bytes[bytePos++] = 0x30 + Math.floor(dbcsCode / 1260); + dbcsCode = dbcsCode % 1260; + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 10); + dbcsCode = dbcsCode % 10; + bytes[bytePos++] = 0x30 + dbcsCode; + continue; + } } } - } - // 3. Write dbcsCode character. - if (dbcsCode === UNASSIGNED) { - dbcsCode = this.defaultCharSingleByte; - } + // 3. Write dbcsCode character. + if (dbcsCode === UNASSIGNED) { + dbcsCode = this.defaultCharSingleByte; + } - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; - } else if (dbcsCode < 0x10000) { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte - } else if (dbcsCode < 0x1000000) { - newBuf[j++] = dbcsCode >> 16; - newBuf[j++] = (dbcsCode >> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; - } else { - newBuf[j++] = dbcsCode >>> 24; - newBuf[j++] = (dbcsCode >>> 16) & 0xff; - newBuf[j++] = (dbcsCode >>> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else if (dbcsCode < 0x10000) { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } else if (dbcsCode < 0x1000000) { + bytes[bytePos++] = dbcsCode >> 16; + bytes[bytePos++] = (dbcsCode >> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } else { + bytes[bytePos++] = dbcsCode >>> 24; + bytes[bytePos++] = (dbcsCode >>> 16) & 0xff; + bytes[bytePos++] = (dbcsCode >>> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } } - } - this.seqObj = seqObj; - this.leadSurrogate = leadSurrogate; - return newBuf.slice(0, j); -}; - -DBCSEncoder.prototype.end = function () { - if (this.leadSurrogate === -1 && this.seqObj === undefined) { - return undefined; // All clean. Most often case. + this.seqObj = seqObj; + this.leadSurrogate = leadSurrogate; + return this.backend.bytesToResult(bytes, bytePos); } - const newBuf = Buffer.alloc(10); - let j = 0; + end() { + if (this.leadSurrogate === -1 && this.seqObj === undefined) { + return undefined; // All clean. Most often case. + } - if (this.seqObj) { - // We're in the sequence. - const dbcsCode = this.seqObj[DEF_CHAR]; - if (dbcsCode !== undefined) { - // Write beginning of the sequence. - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; + const bytes = this.backend.allocBytes(10); + let bytePos = 0; + + if (this.seqObj) { + // We're in the sequence. + const dbcsCode = this.seqObj[DEF_CHAR]; + if (dbcsCode !== undefined) { + // Write beginning of the sequence. + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } } else { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte + // See todo above. } - } else { - // See todo above. + this.seqObj = undefined; } - this.seqObj = undefined; - } - if (this.leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - newBuf[j++] = this.defaultCharSingleByte; - this.leadSurrogate = -1; - } + if (this.leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + bytes[bytePos++] = this.defaultCharSingleByte; + this.leadSurrogate = -1; + } - return newBuf.slice(0, j); -}; + return this.backend.bytesToResult(bytes, bytePos); + } -// Export for testing -DBCSEncoder.prototype.findIdx = findIdx; + // Export for testing + findIdx(table, val) { + return findIdx(table, val); + } +} // == Decoder ================================================================== -function DBCSDecoder(options, codec) { - // Decoder state - this.nodeIdx = 0; - this.prevBytes = []; +class DBCSDecoder { + constructor(options, codec, backend) { + this.backend = backend; - // Static data - this.decodeTables = codec.decodeTables; - this.decodeTableSeq = codec.decodeTableSeq; - this.defaultCharUnicode = codec.defaultCharUnicode; - this.gb18030 = codec.gb18030; -} + // Decoder state + this.nodeIdx = 0; + this.prevBytes = []; -DBCSDecoder.prototype.write = function (buf) { - const newBuf = Buffer.alloc(buf.length * 2), - prevBytes = this.prevBytes, - prevOffset = this.prevBytes.length; - - let nodeIdx = this.nodeIdx, - seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. - j = 0; - - for (let i = 0; i < buf.length; i++) { - const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; - - // TODO: Check curByte is number 0 <= < 256 - - // Lookup in current trie node. - let uCode = this.decodeTables[nodeIdx][curByte]; - - if (uCode >= 0) { - // Normal character, just use it. - } else if (uCode === UNASSIGNED) { - // Unknown char. - // TODO: Callback with seq. - uCode = this.defaultCharUnicode.charCodeAt(0); - i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. - } else if (uCode === GB18030_CODE) { - const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; - const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; - const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; - const ptr = - (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); - const idx = findIdx(this.gb18030.gbChars, ptr); - uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; - } else if (uCode <= NODE_START) { - // Go to next trie node. - nodeIdx = NODE_START - uCode; - continue; - } else if (uCode <= SEQ_START) { - // Output a sequence of chars. - const seq = this.decodeTableSeq[SEQ_START - uCode]; - for (let k = 0; k < seq.length - 1; k++) { - uCode = seq[k]; - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - } - uCode = seq[seq.length - 1]; - } else - throw new Error( - `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` - ); + // Static data + this.decodeTables = codec.decodeTables; + this.decodeTableSeq = codec.decodeTableSeq; + this.defaultCharUnicode = codec.defaultCharUnicode; + this.gb18030 = codec.gb18030; + } - // Write the character to buffer, handling higher planes using surrogate pair. - if (uCode >= 0x10000) { - uCode -= 0x10000; - const uCodeLead = 0xd800 | (uCode >> 10); - newBuf[j++] = uCodeLead & 0xff; - newBuf[j++] = uCodeLead >> 8; + write(buf) { + const chars = this.backend.allocRawChars(buf.length), + prevBytes = this.prevBytes, + prevOffset = this.prevBytes.length; + + let nodeIdx = this.nodeIdx, + seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. + charPos = 0; + + for (let i = 0; i < buf.length; i++) { + const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; + + // TODO: Check curByte is number 0 <= < 256 + + // Lookup in current trie node. + let uCode = this.decodeTables[nodeIdx][curByte]; + + if (uCode >= 0) { + // Normal character, just use it. + } else if (uCode === UNASSIGNED) { + // Unknown char. + // TODO: Callback with seq. + uCode = this.defaultCharUnicode.charCodeAt(0); + i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. + } else if (uCode === GB18030_CODE) { + const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; + const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; + const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; + const ptr = + (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); + const idx = findIdx(this.gb18030.gbChars, ptr); + uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; + } else if (uCode <= NODE_START) { + // Go to next trie node. + nodeIdx = NODE_START - uCode; + continue; + } else if (uCode <= SEQ_START) { + // Output a sequence of chars. + const seq = this.decodeTableSeq[SEQ_START - uCode]; + for (let k = 0; k < seq.length - 1; k++) { + uCode = seq[k]; + chars[charPos++] = uCode; + } + uCode = seq[seq.length - 1]; + } else + throw new Error( + `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` + ); + + // Write the character to buffer, handling higher planes using surrogate pair. + if (uCode >= 0x10000) { + uCode -= 0x10000; + const uCodeLead = 0xd800 | (uCode >> 10); + chars[charPos++] = uCodeLead; + + uCode = 0xdc00 | (uCode & 0x3ff); + } + chars[charPos++] = uCode; - uCode = 0xdc00 | (uCode & 0x3ff); + // Reset trie node. + nodeIdx = 0; + seqStart = i + 1; } - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - // Reset trie node. - nodeIdx = 0; - seqStart = i + 1; - } + this.nodeIdx = nodeIdx; + this.prevBytes = + seqStart >= 0 + ? Array.prototype.slice.call(buf, seqStart) + : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); - this.nodeIdx = nodeIdx; - this.prevBytes = - seqStart >= 0 - ? Array.prototype.slice.call(buf, seqStart) - : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); + return this.backend.rawCharsToResult(chars, charPos); + } - return newBuf.slice(0, j).toString("ucs2"); -}; + end() { + let ret = ""; -DBCSDecoder.prototype.end = function () { - let ret = ""; + // Try to parse all remaining chars. + while (this.prevBytes.length > 0) { + // Skip 1 character in the buffer. + ret += this.defaultCharUnicode; + const bytesArr = this.prevBytes.slice(1); - // Try to parse all remaining chars. - while (this.prevBytes.length > 0) { - // Skip 1 character in the buffer. - ret += this.defaultCharUnicode; - const bytesArr = this.prevBytes.slice(1); + // Parse remaining as usual. + this.prevBytes = []; + this.nodeIdx = 0; + if (bytesArr.length > 0) ret += this.write(bytesArr); + } - // Parse remaining as usual. this.prevBytes = []; this.nodeIdx = 0; - if (bytesArr.length > 0) ret += this.write(bytesArr); - } - this.prevBytes = []; - this.nodeIdx = 0; - return ret; -}; + return ret; + } +} // Binary search for GB18030. Returns largest i such that table[i] <= val. function findIdx(table, val) { diff --git a/test/gbkFile.txt b/generation/fixtures/gbkFile.txt similarity index 100% rename from test/gbkFile.txt rename to generation/fixtures/gbkFile.txt diff --git a/generation/gen-gbk-big5-fixtures.js b/generation/gen-gbk-big5-fixtures.js new file mode 100644 index 0000000..595c262 --- /dev/null +++ b/generation/gen-gbk-big5-fixtures.js @@ -0,0 +1,41 @@ +"use strict"; + +const Iconv = require("iconv").Iconv, + fs = require("fs"), + path = require("path"), + utils = require("../test/utils"); + +const fixtures = { + big5: big5(), + gbk: gbk(), +}; +const outputFile = path.resolve(__dirname, "..", "test", "fixtures", "gbk-big5.json"); +fs.writeFileSync(outputFile, JSON.stringify(fixtures)); + +function gbk() { + const inputFile = path.resolve(__dirname, "fixtures", "gbkFile.txt"); + const contentBuffer = fs.readFileSync(inputFile); + + const codec = Iconv("GBK", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} + +function big5() { + const contentBuffer = Buffer.from( + "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", + "base64" + ); + + const codec = Iconv("big5", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} diff --git a/test/big5-test.js b/test/big5-test.js index c7a7a38..4c492f3 100644 --- a/test/big5-test.js +++ b/test/big5-test.js @@ -1,71 +1,68 @@ "use strict"; -var assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + fixtures = require("./fixtures/gbk-big5.json"), + iconv = utils.requireIconv(); -var testString = "中文abc", //unicode contains Big5-code and ascii - testStringBig5Buffer = Buffer.from([0xa4, 0xa4, 0xa4, 0xe5, 0x61, 0x62, 0x63]), +const testString = "中文abc", //unicode contains Big5-code and ascii + testStringBig5Buffer = utils.bytes("a4 a4 a4 e5 61 62 63"), testString2 = "測試", - testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]); + testStringBig5Buffer2 = utils.bytes("b4 fa b8 d5"); -describe("Big5 tests", function () { +describe("Big5 tests #node-web", function () { it("Big5 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "big5").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "big5")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString); assert.strictEqual( - iconv.encode(testString2, "big5").toString("hex"), - testStringBig5Buffer2.toString("hex") + utils.hex(iconv.encode(testString2, "big5")), + utils.hex(testStringBig5Buffer2) ); assert.strictEqual(iconv.decode(testStringBig5Buffer2, "big5"), testString2); }); it("cp950 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "cp950").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "cp950")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString); }); it("Big5 file read decoded,compare with iconv result", function () { - var contentBuffer = Buffer.from( - "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", - "base64" - ); - var str = iconv.decode(contentBuffer, "big5"); - var iconvc = new (require("iconv").Iconv)("big5", "utf8"); - assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); + const contentBuffer = utils.bytes(fixtures.big5.bytes); + const str = iconv.decode(contentBuffer, "big5"); + assert.strictEqual(fixtures.big5.string, str); }); it("Big5 correctly decodes and encodes characters · and ×", function () { // https://github.com/ashtuchkin/iconv-lite/issues/13 // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT - var chars = "·×"; - var big5Chars = Buffer.from([0xa1, 0x50, 0xa1, 0xd1]); - assert.strictEqual(iconv.encode(chars, "big5").toString("hex"), big5Chars.toString("hex")); + const chars = "·×"; + const big5Chars = utils.bytes("a1 50 a1 d1"); + assert.strictEqual(utils.hex(iconv.encode(chars, "big5")), utils.hex(big5Chars)); assert.strictEqual(iconv.decode(big5Chars, "big5"), chars); }); it("Big5 correctly encodes & decodes sequences", function () { - assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString("hex"), "8862"); - assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString("hex"), "88a5"); - assert.strictEqual(iconv.encode("\u00CA", "big5").toString("hex"), "8866"); - assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString("hex"), "88668866"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u0304", "big5")), "88 62"); + assert.strictEqual(utils.hex(iconv.encode("\u00EA\u030C", "big5")), "88 a5"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA", "big5")), "88 66"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u00CA", "big5")), "88 66 88 66"); - assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString("hex"), "88663f"); // Unfinished surrogate. - assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD800", "big5")), "88 66 3f"); // Unfinished surrogate. + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD841\uDD47", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA𠕇", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.decode(Buffer.from("8862", "hex"), "big5"), "\u00CA\u0304"); - assert.strictEqual(iconv.decode(Buffer.from("8866", "hex"), "big5"), "\u00CA"); - assert.strictEqual(iconv.decode(Buffer.from("8866fa40", "hex"), "big5"), "\u00CA𠕇"); + assert.strictEqual(iconv.decode(utils.bytes("88 62"), "big5"), "\u00CA\u0304"); + assert.strictEqual(iconv.decode(utils.bytes("88 66"), "big5"), "\u00CA"); + assert.strictEqual(iconv.decode(utils.bytes("88 66 fa 40"), "big5"), "\u00CA𠕇"); }); it("Big5 correctly encodes 十", function () { - assert.strictEqual(iconv.encode("十", "big5").toString("hex"), "a451"); + assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51"); }); }); diff --git a/test/fixtures/gbk-big5.json b/test/fixtures/gbk-big5.json new file mode 100644 index 0000000..2a00670 --- /dev/null +++ b/test/fixtures/gbk-big5.json @@ -0,0 +1,10 @@ +{ + "big5": { + "bytes": "3c 48 54 4d 4c 3e 0d 0a 3c 48 45 41 44 3e 20 20 20 20 0d 0a 09 3c 54 49 54 4c 45 3e 20 6d 65 74 61 20 bc d0 c5 d2 aa ba a8 cf a5 ce a1 47 a4 a4 a4 e5 ba f4 ad b6 20 3c 2f 54 49 54 4c 45 3e 0d 0a 09 3c 6d 65 74 61 20 48 54 54 50 2d 45 51 55 49 56 3d 22 43 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 43 4f 4e 54 45 4e 54 3d 22 74 65 78 74 2f 68 74 6d 6c 3b 20 63 68 61 72 73 65 74 3d 62 69 67 35 22 3e 0d 0a 3c 2f 48 45 41 44 3e 0d 0a 3c 42 4f 44 59 3e 0d 0a 0d 0a b3 6f ac 4f a4 40 ad d3 c1 63 c5 e9 a4 a4 a4 e5 ba f4 ad b6 a1 49 3c 62 72 3e 0d 0a 28 54 68 69 73 20 70 61 67 65 20 75 73 65 73 20 62 69 67 35 20 63 68 61 72 61 63 74 65 72 20 73 65 74 2e 29 3c 62 72 3e 0d 0a 63 68 61 72 73 65 74 3d 62 69 67 35 0d 0a 0d 0a 3c 2f 42 4f 44 59 3e 0d 0a 3c 2f 48 54 4d 4c 3e", + "string": "\r\n \r\n\t meta 標籤的使用:中文網頁 \r\n\t\r\n\r\n\r\n\r\n這是一個繁體中文網頁!
\r\n(This page uses big5 character set.)
\r\ncharset=big5\r\n\r\n\r\n" + }, + "gbk": { + "bytes": "3c 21 64 6f 63 74 79 70 65 20 68 74 6d 6c 3e 3c 68 74 6d 6c 3e 3c 68 65 61 64 3e 3c 6d 65 74 61 20 68 74 74 70 2d 65 71 75 69 76 3d 22 43 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 63 6f 6e 74 65 6e 74 3d 22 74 65 78 74 2f 68 74 6d 6c 3b 63 68 61 72 73 65 74 3d 67 62 32 33 31 32 22 3e 3c 74 69 74 6c 65 3e b0 d9 b6 c8 d2 bb cf c2 a3 ac c4 e3 be cd d6 aa b5 c0 20 20 20 20 20 20 3c 2f 74 69 74 6c 65 3e 3c 73 74 79 6c 65 3e 68 74 6d 6c 7b 6f 76 65 72 66 6c 6f 77 2d 79 3a 61 75 74 6f 7d 62 6f 64 79 7b 66 6f 6e 74 3a 31 32 70 78 20 61 72 69 61 6c 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 63 65 6e 74 65 72 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 66 66 66 7d 62 6f 64 79 2c 70 2c 66 6f 72 6d 2c 75 6c 2c 6c 69 7b 6d 61 72 67 69 6e 3a 30 3b 70 61 64 64 69 6e 67 3a 30 3b 6c 69 73 74 2d 73 74 79 6c 65 3a 6e 6f 6e 65 7d 62 6f 64 79 2c 66 6f 72 6d 2c 23 66 6d 7b 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 7d 74 64 7b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 7d 69 6d 67 7b 62 6f 72 64 65 72 3a 30 7d 61 7b 63 6f 6c 6f 72 3a 23 30 30 63 7d 61 3a 61 63 74 69 76 65 7b 63 6f 6c 6f 72 3a 23 66 36 30 7d 23 75 7b 70 61 64 64 69 6e 67 3a 37 70 78 20 31 30 70 78 20 33 70 78 20 30 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 72 69 67 68 74 7d 23 6d 7b 77 69 64 74 68 3a 36 38 30 70 78 3b 6d 61 72 67 69 6e 3a 30 20 61 75 74 6f 7d 23 6e 76 7b 66 6f 6e 74 2d 73 69 7a 65 3a 31 36 70 78 3b 6d 61 72 67 69 6e 3a 30 20 30 20 34 70 78 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 3b 74 65 78 74 2d 69 6e 64 65 6e 74 3a 31 31 37 70 78 7d 23 6e 76 20 61 2c 23 6e 76 20 62 2c 2e 62 74 6e 2c 23 6c 6b 7b 66 6f 6e 74 2d 73 69 7a 65 3a 31 34 70 78 7d 23 66 6d 7b 70 61 64 64 69 6e 67 2d 6c 65 66 74 3a 39 30 70 78 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 7d 23 6b 77 7b 77 69 64 74 68 3a 34 30 34 70 78 3b 68 65 69 67 68 74 3a 32 32 70 78 3b 70 61 64 64 69 6e 67 3a 34 70 78 20 37 70 78 3b 70 61 64 64 69 6e 67 3a 36 70 78 20 37 70 78 20 32 70 78 5c 39 3b 66 6f 6e 74 3a 31 36 70 78 20 61 72 69 61 6c 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 20 2d 33 30 34 70 78 20 30 3b 5f 62 61 63 6b 67 72 6f 75 6e 64 2d 61 74 74 61 63 68 6d 65 6e 74 3a 66 69 78 65 64 3b 62 6f 72 64 65 72 3a 31 70 78 20 73 6f 6c 69 64 20 23 63 64 63 64 63 64 3b 62 6f 72 64 65 72 2d 63 6f 6c 6f 72 3a 23 39 61 39 61 39 61 20 23 63 64 63 64 63 64 20 23 63 64 63 64 63 64 20 23 39 61 39 61 39 61 3b 76 65 72 74 69 63 61 6c 2d 61 6c 69 67 6e 3a 74 6f 70 7d 2e 62 74 6e 7b 77 69 64 74 68 3a 39 35 70 78 3b 68 65 69 67 68 74 3a 33 32 70 78 3b 70 61 64 64 69 6e 67 3a 30 3b 70 61 64 64 69 6e 67 2d 74 6f 70 3a 32 70 78 5c 39 3b 62 6f 72 64 65 72 3a 30 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 64 64 64 20 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 7d 2e 62 74 6e 5f 68 7b 62 61 63 6b 67 72 6f 75 6e 64 2d 70 6f 73 69 74 69 6f 6e 3a 2d 31 30 30 70 78 20 30 7d 23 6b 77 2c 2e 62 74 6e 5f 77 72 7b 6d 61 72 67 69 6e 3a 30 20 35 70 78 20 30 20 30 7d 2e 62 74 6e 5f 77 72 7b 77 69 64 74 68 3a 39 37 70 78 3b 68 65 69 67 68 74 3a 33 34 70 78 3b 64 69 73 70 6c 61 79 3a 69 6e 6c 69 6e 65 2d 62 6c 6f 63 6b 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 20 2d 32 30 32 70 78 20 30 3b 5f 74 6f 70 3a 31 70 78 3b 2a 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 7d 23 6c 6b 7b 6d 61 72 67 69 6e 3a 33 33 70 78 20 30 7d 23 6c 6b 20 73 70 61 6e 7b 66 6f 6e 74 3a 31 34 70 78 20 22 cb ce cc e5 22 7d 23 6c 6d 7b 68 65 69 67 68 74 3a 36 30 70 78 7d 23 6c 68 7b 6d 61 72 67 69 6e 3a 31 36 70 78 20 30 20 35 70 78 3b 77 6f 72 64 2d 73 70 61 63 69 6e 67 3a 33 70 78 7d 2e 74 6f 6f 6c 73 7b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 74 6f 70 3a 2d 34 70 78 3b 2a 74 6f 70 3a 31 30 70 78 3b 72 69 67 68 74 3a 2d 31 33 70 78 3b 7d 23 6d 48 6f 6c 64 65 72 7b 77 69 64 74 68 3a 36 32 70 78 3b 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 3b 7a 2d 69 6e 64 65 78 3a 32 39 36 3b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 7d 23 6d 43 6f 6e 7b 68 65 69 67 68 74 3a 31 38 70 78 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 31 38 70 78 3b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 3b 70 61 64 64 69 6e 67 3a 30 20 31 38 70 78 20 30 20 30 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 62 67 2d 31 2e 30 2e 30 2e 67 69 66 29 20 6e 6f 2d 72 65 70 65 61 74 20 72 69 67 68 74 20 2d 31 33 34 70 78 3b 62 61 63 6b 67 72 6f 75 6e 64 2d 70 6f 73 69 74 69 6f 6e 3a 72 69 67 68 74 20 2d 31 33 36 70 78 5c 39 7d 23 6d 43 6f 6e 20 73 70 61 6e 7b 63 6f 6c 6f 72 3a 23 30 30 63 3b 63 75 72 73 6f 72 3a 64 65 66 61 75 6c 74 3b 64 69 73 70 6c 61 79 3a 62 6c 6f 63 6b 7d 23 6d 43 6f 6e 20 2e 68 77 7b 74 65 78 74 2d 64 65 63 6f 72 61 74 69 6f 6e 3a 75 6e 64 65 72 6c 69 6e 65 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 7d 23 6d 4d 65 6e 75 7b 77 69 64 74 68 3a 35 36 70 78 3b 62 6f 72 64 65 72 3a 31 70 78 20 73 6f 6c 69 64 20 23 39 61 39 39 66 66 3b 6c 69 73 74 2d 73 74 79 6c 65 3a 6e 6f 6e 65 3b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 72 69 67 68 74 3a 37 70 78 3b 74 6f 70 3a 32 38 70 78 3b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 66 66 66 7d 23 6d 4d 65 6e 75 20 61 7b 77 69 64 74 68 3a 31 30 30 25 3b 68 65 69 67 68 74 3a 31 30 30 25 3b 64 69 73 70 6c 61 79 3a 62 6c 6f 63 6b 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 32 32 70 78 3b 74 65 78 74 2d 69 6e 64 65 6e 74 3a 36 70 78 3b 74 65 78 74 2d 64 65 63 6f 72 61 74 69 6f 6e 3a 6e 6f 6e 65 7d 23 6d 4d 65 6e 75 20 61 3a 68 6f 76 65 72 7b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 64 39 65 31 66 36 7d 23 6d 4d 65 6e 75 20 2e 6c 6e 7b 68 65 69 67 68 74 3a 31 70 78 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 63 63 66 3b 6f 76 65 72 66 6c 6f 77 3a 68 69 64 64 65 6e 3b 6d 61 72 67 69 6e 3a 32 70 78 3b 66 6f 6e 74 2d 73 69 7a 65 3a 31 70 78 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 31 70 78 7d 23 63 70 2c 23 63 70 20 61 7b 63 6f 6c 6f 72 3a 23 37 37 63 7d 23 73 65 74 68 7b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 3b 62 65 68 61 76 69 6f 72 3a 75 72 6c 28 23 64 65 66 61 75 6c 74 23 68 6f 6d 65 70 61 67 65 29 7d 23 73 65 74 66 7b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 7d 3c 2f 73 74 79 6c 65 3e 0d 0a 3c 2f 68 65 61 64 3e 0d 0a 0d 0a 3c 62 6f 64 79 3e 3c 64 69 76 20 69 64 3d 22 75 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 67 61 6f 6a 69 2f 70 72 65 66 65 72 65 6e 63 65 73 2e 68 74 6d 6c 22 20 6e 61 6d 65 3d 22 74 6a 5f 73 65 74 74 69 6e 67 22 3e cb d1 cb f7 c9 e8 d6 c3 3c 2f 61 3e 26 6e 62 73 70 3b 7c 26 6e 62 73 70 3b 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 70 61 73 73 70 6f 72 74 2e 62 61 69 64 75 2e 63 6f 6d 2f 3f 6c 6f 67 69 6e 26 74 70 6c 3d 6d 6e 22 20 6e 61 6d 65 3d 22 74 6a 5f 6c 6f 67 69 6e 22 3e b5 c7 c2 bc 3c 2f 61 3e 3c 2f 64 69 76 3e 0d 0a 3c 64 69 76 20 69 64 3d 22 6d 22 3e 3c 70 20 69 64 3d 22 6c 67 22 3e 3c 69 6d 67 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 62 61 69 64 75 5f 73 79 6c 6f 67 6f 31 2e 67 69 66 22 20 77 69 64 74 68 3d 22 32 37 30 22 20 68 65 69 67 68 74 3d 22 31 32 39 22 20 75 73 65 6d 61 70 3d 22 23 6d 70 22 3e 3c 6d 61 70 20 6e 61 6d 65 3d 22 6d 70 22 3e 3c 61 72 65 61 20 73 68 61 70 65 3d 22 72 65 63 74 22 20 63 6f 6f 72 64 73 3d 22 34 30 2c 32 35 2c 32 33 30 2c 39 35 22 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 69 2e 62 61 69 64 75 2e 63 6f 6d 2f 62 61 69 64 75 2f 22 20 74 61 72 67 65 74 3d 22 5f 62 6c 61 6e 6b 22 20 74 69 74 6c 65 3d 22 b5 e3 b4 cb bd f8 c8 eb 20 b0 d9 b6 c8 b5 c4 bf d5 bc e4 22 20 3e 3c 2f 6d 61 70 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 6e 76 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6e 65 77 73 2e 62 61 69 64 75 2e 63 6f 6d 22 3e d0 c2 26 6e 62 73 70 3b ce c5 3c 2f 61 3e a1 a1 3c 62 3e cd f8 26 6e 62 73 70 3b d2 b3 3c 2f 62 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 74 69 65 62 61 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cc f9 26 6e 62 73 70 3b b0 c9 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 7a 68 69 64 61 6f 2e 62 61 69 64 75 2e 63 6f 6d 22 3e d6 aa 26 6e 62 73 70 3b b5 c0 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6d 70 33 2e 62 61 69 64 75 2e 63 6f 6d 22 3e 4d 50 33 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 69 6d 61 67 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cd bc 26 6e 62 73 70 3b c6 ac 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 76 69 64 65 6f 2e 62 61 69 64 75 2e 63 6f 6d 22 3e ca d3 26 6e 62 73 70 3b c6 b5 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6d 61 70 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b5 d8 26 6e 62 73 70 3b cd bc 3c 2f 61 3e 3c 2f 70 3e 3c 64 69 76 20 69 64 3d 22 66 6d 22 3e 3c 66 6f 72 6d 20 6e 61 6d 65 3d 22 66 22 20 61 63 74 69 6f 6e 3d 22 2f 73 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 74 65 78 74 22 20 6e 61 6d 65 3d 22 77 64 22 20 69 64 3d 22 6b 77 22 20 6d 61 78 6c 65 6e 67 74 68 3d 22 31 30 30 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 68 69 64 64 65 6e 22 20 6e 61 6d 65 3d 22 72 73 76 5f 62 70 22 20 76 61 6c 75 65 3d 22 30 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 68 69 64 64 65 6e 22 20 6e 61 6d 65 3d 22 72 73 76 5f 73 70 74 22 20 76 61 6c 75 65 3d 22 33 22 3e 3c 73 70 61 6e 20 63 6c 61 73 73 3d 22 62 74 6e 5f 77 72 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 73 75 62 6d 69 74 22 20 76 61 6c 75 65 3d 22 b0 d9 b6 c8 d2 bb cf c2 22 20 69 64 3d 22 73 75 22 20 63 6c 61 73 73 3d 22 62 74 6e 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 74 68 69 73 2e 63 6c 61 73 73 4e 61 6d 65 3d 27 62 74 6e 20 62 74 6e 5f 68 27 22 20 6f 6e 6d 6f 75 73 65 6f 75 74 3d 22 74 68 69 73 2e 63 6c 61 73 73 4e 61 6d 65 3d 27 62 74 6e 27 22 3e 3c 2f 73 70 61 6e 3e 3c 2f 66 6f 72 6d 3e 3c 73 70 61 6e 20 63 6c 61 73 73 3d 22 74 6f 6f 6c 73 22 3e 3c 73 70 61 6e 20 69 64 3d 22 6d 48 6f 6c 64 65 72 22 3e 3c 64 69 76 20 69 64 3d 22 6d 43 6f 6e 22 3e 3c 73 70 61 6e 3e ca e4 c8 eb b7 a8 3c 2f 73 70 61 6e 3e 3c 2f 64 69 76 3e 3c 2f 73 70 61 6e 3e 3c 2f 73 70 61 6e 3e 3c 75 6c 20 69 64 3d 22 6d 4d 65 6e 75 22 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 68 77 22 3e ca d6 d0 b4 3c 2f 61 3e 3c 2f 6c 69 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 70 79 22 3e c6 b4 d2 f4 3c 2f 61 3e 3c 2f 6c 69 3e 3c 6c 69 20 63 6c 61 73 73 3d 22 6c 6e 22 3e 3c 2f 6c 69 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 63 6c 22 3e b9 d8 b1 d5 3c 2f 61 3e 3c 2f 6c 69 3e 3c 2f 75 6c 3e 3c 2f 64 69 76 3e 0d 0a 3c 70 20 69 64 3d 22 6c 6b 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 69 2e 62 61 69 64 75 2e 63 6f 6d 22 3e bf d5 bc e4 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 62 61 69 6b 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b0 d9 bf c6 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 68 61 6f 31 32 33 2e 63 6f 6d 22 3e 68 61 6f 31 32 33 3c 2f 61 3e 3c 73 70 61 6e 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 2f 6d 6f 72 65 2f 22 3e b8 fc b6 e0 26 67 74 3b 26 67 74 3b 3c 2f 61 3e 3c 2f 73 70 61 6e 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 6c 6d 22 3e 3c 2f 70 3e 3c 70 3e 3c 61 20 69 64 3d 22 73 65 74 68 22 20 6f 6e 43 6c 69 63 6b 3d 22 74 68 69 73 2e 73 65 74 48 6f 6d 65 50 61 67 65 28 27 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 27 29 22 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 75 74 69 6c 69 74 79 2e 62 61 69 64 75 2e 63 6f 6d 2f 74 72 61 66 2f 63 6c 69 63 6b 2e 70 68 70 3f 69 64 3d 32 31 35 26 75 72 6c 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 72 65 74 75 72 6e 20 6e 73 5f 63 28 7b 27 66 6d 27 3a 27 62 65 68 73 27 2c 27 74 61 62 27 3a 27 68 6f 6d 65 70 61 67 65 27 2c 27 70 6f 73 27 3a 30 7d 29 22 3e b0 d1 b0 d9 b6 c8 c9 e8 ce aa d6 f7 d2 b3 3c 2f 61 3e 3c 61 20 69 64 3d 22 73 65 74 66 22 20 6f 6e 43 6c 69 63 6b 3d 22 66 61 28 74 68 69 73 29 22 20 68 72 65 66 3d 22 6a 61 76 61 73 63 72 69 70 74 3a 76 6f 69 64 28 30 29 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 72 65 74 75 72 6e 20 6e 73 5f 63 28 7b 27 66 6d 27 3a 27 62 65 68 73 27 2c 27 74 61 62 27 3a 27 66 61 76 6f 72 69 74 65 73 27 2c 27 70 6f 73 27 3a 30 7d 29 22 3e b0 d1 b0 d9 b6 c8 bc d3 c8 eb ca d5 b2 d8 bc d0 3c 2f 61 3e 3c 2f 70 3e 0d 0a 3c 70 20 69 64 3d 22 6c 68 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 65 2e 62 61 69 64 75 2e 63 6f 6d 2f 3f 72 65 66 65 72 3d 38 38 38 22 3e bc d3 c8 eb b0 d9 b6 c8 cd c6 b9 e3 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 74 6f 70 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cb d1 cb f7 b7 e7 d4 c6 b0 f1 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 6f 6d 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b9 d8 d3 da b0 d9 b6 c8 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 69 72 2e 62 61 69 64 75 2e 63 6f 6d 22 3e 41 62 6f 75 74 20 42 61 69 64 75 3c 2f 61 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 63 70 22 3e 26 63 6f 70 79 3b 32 30 31 31 20 42 61 69 64 75 20 3c 61 20 68 72 65 66 3d 22 2f 64 75 74 79 2f 22 3e ca b9 d3 c3 b0 d9 b6 c8 c7 b0 b1 d8 b6 c1 3c 2f 61 3e 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 6d 69 69 62 65 69 61 6e 2e 67 6f 76 2e 63 6e 22 20 74 61 72 67 65 74 3d 22 5f 62 6c 61 6e 6b 22 3e be a9 49 43 50 d6 a4 30 33 30 31 37 33 ba c5 3c 2f 61 3e 20 3c 69 6d 67 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 67 69 6d 67 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 67 73 2e 67 69 66 22 3e 3c 2f 70 3e 3c 2f 64 69 76 3e 3c 2f 62 6f 64 79 3e 0d 0a 0d 0a 3c 73 63 72 69 70 74 3e 76 61 72 20 77 3d 77 69 6e 64 6f 77 2c 64 3d 64 6f 63 75 6d 65 6e 74 2c 6e 3d 6e 61 76 69 67 61 74 6f 72 2c 6b 3d 64 2e 66 2e 77 64 2c 61 3d 64 2e 67 65 74 45 6c 65 6d 65 6e 74 42 79 49 64 28 22 6e 76 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 69 73 49 45 3d 6e 2e 75 73 65 72 41 67 65 6e 74 2e 69 6e 64 65 78 4f 66 28 22 4d 53 49 45 22 29 21 3d 2d 31 26 26 21 77 69 6e 64 6f 77 2e 6f 70 65 72 61 3b 66 6f 72 28 76 61 72 20 69 3d 30 3b 69 3c 61 2e 6c 65 6e 67 74 68 3b 69 2b 2b 29 7b 61 5b 69 5d 2e 6f 6e 63 6c 69 63 6b 3d 66 75 6e 63 74 69 6f 6e 28 29 7b 69 66 28 6b 2e 76 61 6c 75 65 2e 6c 65 6e 67 74 68 3e 30 29 7b 76 61 72 20 43 3d 74 68 69 73 2c 41 3d 43 2e 68 72 65 66 2c 42 3d 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 6b 2e 76 61 6c 75 65 29 3b 69 66 28 41 2e 69 6e 64 65 78 4f 66 28 22 71 3d 22 29 21 3d 2d 31 29 7b 43 2e 68 72 65 66 3d 41 2e 72 65 70 6c 61 63 65 28 2f 71 3d 5b 5e 26 5c 78 32 34 5d 2a 2f 2c 22 71 3d 22 2b 42 29 7d 65 6c 73 65 7b 74 68 69 73 2e 68 72 65 66 2b 3d 22 3f 71 3d 22 2b 42 7d 7d 7d 7d 28 66 75 6e 63 74 69 6f 6e 28 29 7b 69 66 28 2f 71 3d 28 5b 5e 26 5d 2b 29 2f 2e 74 65 73 74 28 6c 6f 63 61 74 69 6f 6e 2e 73 65 61 72 63 68 29 29 7b 6b 2e 76 61 6c 75 65 3d 64 65 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 52 65 67 45 78 70 5b 22 5c 78 32 34 31 22 5d 29 7d 7d 29 28 29 3b 69 66 28 6e 2e 63 6f 6f 6b 69 65 45 6e 61 62 6c 65 64 26 26 21 2f 73 75 67 3f 3d 30 2f 2e 74 65 73 74 28 64 2e 63 6f 6f 6b 69 65 29 29 7b 64 2e 77 72 69 74 65 28 22 3c 73 63 72 69 70 74 20 73 72 63 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 6a 73 2f 62 64 73 75 67 2e 6a 73 3f 76 3d 31 2e 30 2e 33 2e 30 3e 3c 5c 2f 73 63 72 69 70 74 3e 22 29 7d 66 75 6e 63 74 69 6f 6e 20 61 64 64 45 56 28 43 2c 42 2c 41 29 7b 69 66 28 77 2e 61 74 74 61 63 68 45 76 65 6e 74 29 7b 43 2e 61 74 74 61 63 68 45 76 65 6e 74 28 22 6f 6e 22 2b 42 2c 41 29 7d 65 6c 73 65 7b 69 66 28 77 2e 61 64 64 45 76 65 6e 74 4c 69 73 74 65 6e 65 72 29 7b 43 2e 61 64 64 45 76 65 6e 74 4c 69 73 74 65 6e 65 72 28 42 2c 41 2c 66 61 6c 73 65 29 7d 7d 7d 66 75 6e 63 74 69 6f 6e 20 47 28 41 29 7b 72 65 74 75 72 6e 20 64 2e 67 65 74 45 6c 65 6d 65 6e 74 42 79 49 64 28 41 29 7d 66 75 6e 63 74 69 6f 6e 20 6e 73 5f 63 28 45 29 7b 76 61 72 20 46 3d 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 77 69 6e 64 6f 77 2e 64 6f 63 75 6d 65 6e 74 2e 6c 6f 63 61 74 69 6f 6e 2e 68 72 65 66 29 2c 44 3d 22 22 2c 41 3d 22 22 2c 42 3d 22 22 2c 43 3d 77 69 6e 64 6f 77 5b 22 42 44 5f 50 53 5f 43 22 2b 28 6e 65 77 20 44 61 74 65 28 29 29 2e 67 65 74 54 69 6d 65 28 29 5d 3d 6e 65 77 20 49 6d 61 67 65 28 29 3b 66 6f 72 28 76 20 69 6e 20 45 29 7b 41 3d 45 5b 76 5d 3b 44 2b 3d 76 2b 22 3d 22 2b 41 2b 22 26 22 7d 42 3d 22 26 6d 75 3d 22 2b 46 3b 43 2e 73 72 63 3d 22 68 74 74 70 3a 2f 2f 6e 73 63 6c 69 63 6b 2e 62 61 69 64 75 2e 63 6f 6d 2f 76 2e 67 69 66 3f 70 69 64 3d 32 30 31 26 70 6a 3d 77 77 77 26 22 2b 44 2b 22 70 61 74 68 3d 22 2b 46 2b 22 26 74 3d 22 2b 6e 65 77 20 44 61 74 65 28 29 2e 67 65 74 54 69 6d 65 28 29 3b 72 65 74 75 72 6e 20 74 72 75 65 7d 69 66 28 2f 5c 62 62 64 69 6d 65 3d 5b 31 32 5d 2f 2e 74 65 73 74 28 64 2e 63 6f 6f 6b 69 65 29 29 7b 64 6f 63 75 6d 65 6e 74 2e 77 72 69 74 65 28 22 3c 73 63 72 69 70 74 20 73 72 63 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 63 61 63 68 65 2f 69 6d 65 2f 6a 73 2f 6f 70 65 6e 69 6d 65 2d 31 2e 30 2e 30 2e 6a 73 3e 3c 5c 2f 73 63 72 69 70 74 3e 22 29 7d 28 66 75 6e 63 74 69 6f 6e 28 29 7b 76 61 72 20 42 3d 47 28 22 75 73 65 72 22 29 2c 41 3d 47 28 22 75 73 65 72 4d 65 6e 75 22 29 3b 69 66 28 42 26 26 41 29 7b 61 64 64 45 56 28 42 2c 22 63 6c 69 63 6b 22 2c 66 75 6e 63 74 69 6f 6e 28 43 29 7b 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 3d 22 62 6c 6f 63 6b 22 3f 22 6e 6f 6e 65 22 3a 22 62 6c 6f 63 6b 22 3b 77 69 6e 64 6f 77 2e 65 76 65 6e 74 3f 43 2e 63 61 6e 63 65 6c 42 75 62 62 6c 65 3d 74 72 75 65 3a 43 2e 73 74 6f 70 50 72 6f 70 61 67 61 74 69 6f 6e 28 29 7d 29 3b 61 64 64 45 56 28 64 6f 63 75 6d 65 6e 74 2c 22 63 6c 69 63 6b 22 2c 66 75 6e 63 74 69 6f 6e 28 29 7b 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 22 6e 6f 6e 65 22 7d 29 7d 7d 29 28 29 3b 28 66 75 6e 63 74 69 6f 6e 28 29 7b 76 61 72 20 45 3d 47 28 22 75 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 43 3d 47 28 22 6e 76 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 49 3d 47 28 22 6c 6b 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 42 3d 22 22 3b 76 61 72 20 41 3d 5b 22 6e 65 77 73 22 2c 22 74 69 65 62 61 22 2c 22 7a 68 69 64 61 6f 22 2c 22 6d 70 33 22 2c 22 69 6d 67 22 2c 22 76 69 64 65 6f 22 2c 22 6d 61 70 22 5d 3b 76 61 72 20 48 3d 5b 22 68 69 22 2c 22 62 61 69 6b 65 22 2c 22 68 61 6f 31 32 33 22 2c 22 6d 6f 72 65 22 5d 3b 69 66 28 47 28 22 75 6e 22 29 26 26 47 28 22 75 6e 22 29 2e 69 6e 6e 65 72 48 54 4d 4c 21 3d 22 22 29 7b 42 3d 47 28 22 75 6e 22 29 2e 69 6e 6e 65 72 48 54 4d 4c 7d 66 75 6e 63 74 69 6f 6e 20 44 28 4a 29 7b 61 64 64 45 56 28 4a 2c 22 6d 6f 75 73 65 64 6f 77 6e 22 2c 66 75 6e 63 74 69 6f 6e 28 4c 29 7b 76 61 72 20 4c 3d 4c 7c 7c 77 69 6e 64 6f 77 2e 65 76 65 6e 74 3b 76 61 72 20 4b 3d 4c 2e 74 61 72 67 65 74 7c 7c 4c 2e 73 72 63 45 6c 65 6d 65 6e 74 3b 6e 73 5f 63 28 7b 66 6d 3a 22 62 65 68 73 22 2c 74 61 62 3a 4b 2e 6e 61 6d 65 7c 7c 22 74 6a 5f 75 73 65 72 22 2c 75 6e 3a 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 42 29 7d 29 7d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 45 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 44 28 45 5b 46 5d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 43 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 43 5b 46 5d 2e 6e 61 6d 65 3d 22 74 6a 5f 22 2b 41 5b 46 5d 3b 44 28 43 5b 46 5d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 49 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 49 5b 46 5d 2e 6e 61 6d 65 3d 22 74 6a 5f 22 2b 48 5b 46 5d 3b 44 28 49 5b 46 5d 29 7d 7d 29 28 29 3b 61 64 64 45 56 28 77 2c 22 6c 6f 61 64 22 2c 66 75 6e 63 74 69 6f 6e 28 29 7b 6b 2e 66 6f 63 75 73 28 29 7d 29 3b 77 2e 6f 6e 75 6e 6c 6f 61 64 3d 66 75 6e 63 74 69 6f 6e 28 29 7b 7d 3b 3c 2f 73 63 72 69 70 74 3e 0d 0a 0d 0a 0d 0a 3c 73 63 72 69 70 74 20 74 79 70 65 3d 22 74 65 78 74 2f 6a 61 76 61 73 63 72 69 70 74 22 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 63 61 63 68 65 2f 68 70 73 2f 6a 73 2f 68 70 73 2d 31 2e 32 2e 6a 73 22 3e 3c 2f 73 63 72 69 70 74 3e 0d 0a 0d 0a 3c 2f 68 74 6d 6c 3e 3c 21 2d 2d 62 37 36 32 33 34 35 64 39 37 39 35 36 32 65 38 2d 2d 3e", + "string": "百度一下,你就知道 \r\n\r\n\r\n
搜索设置 | 登录
\r\n

新 闻 网 页 贴 吧 知 道 MP3 图 片 视 频 地 图

\r\n

空间 百科 hao123 | 更多>>

把百度设为主页把百度加入收藏夹

\r\n

加入百度推广 | 搜索风云榜 | 关于百度 | About Baidu

©2011 Baidu 使用百度前必读 京ICP证030173号

\r\n\r\n\r\n\r\n\r\n\r\n\r\n" + } +} diff --git a/test/gbk-test.js b/test/gbk-test.js index 75c3a90..b74033d 100644 --- a/test/gbk-test.js +++ b/test/gbk-test.js @@ -1,46 +1,42 @@ "use strict"; -var fs = require("fs"), - assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + fixtures = require("./fixtures/gbk-big5.json"), + iconv = utils.requireIconv(); -var testString = "中国abc", //unicode contains GBK-code and ascii - testStringGBKBuffer = Buffer.from([0xd6, 0xd0, 0xb9, 0xfa, 0x61, 0x62, 0x63]); +const testString = "中国abc", //unicode contains GBK-code and ascii + testStringGBKBuffer = utils.bytes("d6 d0 b9 fa 61 62 63"); -describe("GBK tests", function () { +describe("GBK tests #node-web", function () { it("GBK correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "GBK").toString("binary"), - testStringGBKBuffer.toString("binary") + utils.hex(iconv.encode(testString, "GBK")), + utils.hex(testStringGBKBuffer) ); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString); }); it("GB2312 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "GB2312").toString("binary"), - testStringGBKBuffer.toString("binary") + utils.hex(iconv.encode(testString, "GB2312")), + utils.hex(testStringGBKBuffer) ); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString); }); it("GBK file read decoded,compare with iconv result", function () { - var contentBuffer = fs.readFileSync(__dirname + "/gbkFile.txt"); - var str = iconv.decode(contentBuffer, "GBK"); - var iconvc = new (require("iconv").Iconv)("GBK", "utf8"); - assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); + const contentBuffer = utils.bytes(fixtures.gbk.bytes); + const str = iconv.decode(contentBuffer, "GBK"); + assert.strictEqual(fixtures.gbk.string, str); }); it("GBK correctly decodes and encodes characters · and ×", function () { // https://github.com/ashtuchkin/iconv-lite/issues/13 // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT - var chars = "·×"; - var gbkChars = Buffer.from([0xa1, 0xa4, 0xa1, 0xc1]); - assert.strictEqual( - iconv.encode(chars, "GBK").toString("binary"), - gbkChars.toString("binary") - ); + const chars = "·×"; + const gbkChars = utils.bytes("a1 a4 a1 c1"); + assert.strictEqual(utils.hex(iconv.encode(chars, "GBK")), utils.hex(gbkChars)); assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars); }); @@ -48,8 +44,8 @@ describe("GBK tests", function () { // Euro character (U+20AC) has two encodings in GBK family: 0x80 and 0xA2 0xE3 // According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder), // Both GBK and GB18030 decoders should accept both encodings. - var gbkEuroEncoding1 = Buffer.from([0x80]), - gbkEuroEncoding2 = Buffer.from([0xa2, 0xe3]), + const gbkEuroEncoding1 = utils.bytes("80"), + gbkEuroEncoding2 = utils.bytes("a2 e3"), strEuro = "€"; assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro); @@ -58,13 +54,10 @@ describe("GBK tests", function () { assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro); // But when decoding, GBK should produce 0x80, but GB18030 - 0xA2 0xE3. + assert.strictEqual(utils.hex(iconv.encode(strEuro, "GBK")), utils.hex(gbkEuroEncoding1)); assert.strictEqual( - iconv.encode(strEuro, "GBK").toString("hex"), - gbkEuroEncoding1.toString("hex") - ); - assert.strictEqual( - iconv.encode(strEuro, "GB18030").toString("hex"), - gbkEuroEncoding2.toString("hex") + utils.hex(iconv.encode(strEuro, "GB18030")), + utils.hex(gbkEuroEncoding2) ); }); @@ -92,65 +85,54 @@ describe("GBK tests", function () { ); }); - function swapBytes(buf) { - for (var i = 0; i < buf.length; i += 2) buf.writeUInt16LE(buf.readUInt16BE(i), i); - return buf; - } - function spacify4(str) { - return str.replace(/(....)/g, "$1 ").trim(); - } - function strToHex(str) { - return spacify4(swapBytes(Buffer.from(str, "ucs2")).toString("hex")); - } - it("GB18030 encodes/decodes 4 byte sequences", function () { - var chars = { - "\u0080": Buffer.from([0x81, 0x30, 0x81, 0x30]), - "\u0081": Buffer.from([0x81, 0x30, 0x81, 0x31]), - "\u008b": Buffer.from([0x81, 0x30, 0x82, 0x31]), - "\u0615": Buffer.from([0x81, 0x31, 0x82, 0x31]), - 㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]), - "\udbd9\ude77": Buffer.from([0xe0, 0x31, 0x82, 0x31]), + const chars = { + "\u0080": utils.bytes("81 30 81 30"), + "\u0081": utils.bytes("81 30 81 31"), + "\u008b": utils.bytes("81 30 82 31"), + "\u0615": utils.bytes("81 31 82 31"), + 㦟: utils.bytes("82 31 82 31"), + "\udbd9\ude77": utils.bytes("e0 31 82 31"), }; - for (var uChar in chars) { - var gbkBuf = chars[uChar]; + for (const uChar in chars) { + const gbkBuf = chars[uChar]; + assert.strictEqual(utils.hex(iconv.encode(uChar, "GB18030")), utils.hex(gbkBuf)); assert.strictEqual( - iconv.encode(uChar, "GB18030").toString("hex"), - gbkBuf.toString("hex") + utils.strToHex(iconv.decode(gbkBuf, "GB18030")), + utils.strToHex(uChar) ); - assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); } }); it("GB18030 correctly decodes incomplete 4 byte sequences", function () { - var chars = { - "�": Buffer.from([0x82]), - "�1": Buffer.from([0x82, 0x31]), - "�1�": Buffer.from([0x82, 0x31, 0x82]), - 㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]), - "� ": Buffer.from([0x82, 0x20]), - "�1 ": Buffer.from([0x82, 0x31, 0x20]), - "�1� ": Buffer.from([0x82, 0x31, 0x82, 0x20]), - "\u399f ": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x20]), - "�1\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x61]), - "�1\u5010\u0061": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x61]), - 㦟俛: Buffer.from([0x82, 0x31, 0x82, 0x31, 0x82, 0x61]), - "�1\u50101�1": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x31, 0x82, 0x31]), + const chars = { + "�": utils.bytes("82"), + "�1": utils.bytes("82 31"), + "�1�": utils.bytes("82 31 82"), + 㦟: utils.bytes("82 31 82 31"), + "� ": utils.bytes("82 20"), + "�1 ": utils.bytes("82 31 20"), + "�1� ": utils.bytes("82 31 82 20"), + "\u399f ": utils.bytes("82 31 82 31 20"), + "�1\u4fdb": utils.bytes("82 31 82 61"), + "�1\u5010\u0061": utils.bytes("82 31 82 82 61"), + 㦟俛: utils.bytes("82 31 82 31 82 61"), + "�1\u50101�1": utils.bytes("82 31 82 82 31 82 31"), }; - for (var uChar in chars) { - var gbkBuf = chars[uChar]; - assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); + for (const uChar in chars) { + const gbkBuf = chars[uChar]; + assert.strictEqual( + utils.strToHex(iconv.decode(gbkBuf, "GB18030")), + utils.strToHex(uChar) + ); } }); it("GB18030:2005 changes are applied", function () { // See https://github.com/whatwg/encoding/issues/22 - var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator - var gbkChars = Buffer.from([0xa8, 0xbc, 0x00, 0x81, 0x35, 0xf4, 0x37]); + const chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator + const gbkChars = utils.bytes("a8 bc 00 81 35 f4 37"); assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars); - assert.strictEqual( - iconv.encode(chars, "GB18030").toString("hex"), - gbkChars.toString("hex") - ); + assert.strictEqual(utils.hex(iconv.encode(chars, "GB18030")), utils.hex(gbkChars)); }); }); diff --git a/test/shiftjis-test.js b/test/shiftjis-test.js index e7a7fbb..52d9ca4 100644 --- a/test/shiftjis-test.js +++ b/test/shiftjis-test.js @@ -1,45 +1,45 @@ "use strict"; -var assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + iconv = utils.requireIconv(); -describe("ShiftJIS tests", function () { +describe("ShiftJIS tests #node-web", function () { it("ShiftJIS correctly encoded/decoded", function () { - var testString = "中文abc", //unicode contains ShiftJIS-code and ascii - testStringBig5Buffer = Buffer.from([0x92, 0x86, 0x95, 0xb6, 0x61, 0x62, 0x63]), + const testString = "中文abc", //unicode contains ShiftJIS-code and ascii + testStringBig5Buffer = utils.bytes("92 86 95 b6 61 62 63"), testString2 = "測試", - testStringBig5Buffer2 = Buffer.from([0x91, 0xaa, 0x8e, 0x8e]); + testStringBig5Buffer2 = utils.bytes("91 aa 8e 8e"); assert.strictEqual( - iconv.encode(testString, "shiftjis").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "shiftjis")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "shiftjis"), testString); assert.strictEqual( - iconv.encode(testString2, "shiftjis").toString("hex"), - testStringBig5Buffer2.toString("hex") + utils.hex(iconv.encode(testString2, "shiftjis")), + utils.hex(testStringBig5Buffer2) ); assert.strictEqual(iconv.decode(testStringBig5Buffer2, "shiftjis"), testString2); }); it("ShiftJIS extended chars are decoded, but not encoded", function () { - var buf = Buffer.from("ed40eefceeef", "hex"), + const buf = utils.bytes("ed 40 ee fc ee ef"), str = "纊"ⅰ", - res = "fa5cfa57fa40", // repeated block (these same chars are repeated in the different place) - buf2 = Buffer.from("f040f2fcf940", "hex"), + res = "fa 5c fa 57 fa 40", // repeated block (these same chars are repeated in the different place) + buf2 = utils.bytes("f0 40 f2 fc f9 40"), str2 = "", - res2 = "3f3f3f"; // non-repeated, UA block. + res2 = "3f 3f 3f"; // non-repeated, UA block. assert.strictEqual(iconv.decode(buf, "shiftjis"), str); assert.strictEqual(iconv.decode(buf2, "shiftjis"), str2); - assert.strictEqual(iconv.encode(str, "shiftjis").toString("hex"), res); - assert.strictEqual(iconv.encode(str2, "shiftjis").toString("hex"), res2); + assert.strictEqual(utils.hex(iconv.encode(str, "shiftjis")), res); + assert.strictEqual(utils.hex(iconv.encode(str2, "shiftjis")), res2); }); it("ShiftJIS includes extensions", function () { - assert.strictEqual(iconv.decode(Buffer.from("8740", "hex"), "shiftjis"), "①"); - assert.strictEqual(iconv.encode("①", "shiftjis").toString("hex"), "8740"); + assert.strictEqual(iconv.decode(utils.bytes("87 40"), "shiftjis"), "①"); + assert.strictEqual(utils.hex(iconv.encode("①", "shiftjis")), "87 40"); }); }); diff --git a/test/webpack/iconv-lite-tests.js b/test/webpack/iconv-lite-tests.js index c2690be..9b1ad0f 100644 --- a/test/webpack/iconv-lite-tests.js +++ b/test/webpack/iconv-lite-tests.js @@ -11,3 +11,6 @@ require("../sbcs-test"); require("../turkish-test"); require("../utf16-test"); require("../utils-test"); +require("../shiftjis-test"); +require("../gbk-test"); +require("../big5-test");