From ed88711c10346207449881c261ed1a4287067470 Mon Sep 17 00:00:00 2001 From: Alexander Shtuchkin Date: Sun, 23 May 2021 18:32:46 -0400 Subject: [PATCH] Fix Big5-HKSCS encoding to prefer non-HKSCS codes in case of multiple options (fixes #264) --- encodings/dbcs-data.js | 14 +++++++++++++- generation/gen-dbcs.js | 32 ++++++++++++++++++++++++++++++-- test/big5-test.js | 4 ++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/encodings/dbcs-data.js b/encodings/dbcs-data.js index 4b61914..0d17e58 100644 --- a/encodings/dbcs-data.js +++ b/encodings/dbcs-data.js @@ -167,7 +167,19 @@ module.exports = { 'big5hkscs': { type: '_dbcs', table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, - encodeSkipVals: [0xa2cc], + encodeSkipVals: [ + // Although the Encoding Standard says we should avoid encoding to the HKSCS area (see Step 1 of + // https://encoding.spec.whatwg.org/#index-big5-pointer), we still do it to increase compatibility with ICU. + // But if a single Unicode code point can be encoded both as HKSCS and as regular Big5, we prefer the latter. 
+ 0x8e69, 0x8e6f, 0x8e7e, 0x8eab, 0x8eb4, 0x8ecd, 0x8ed0, 0x8f57, 0x8f69, 0x8f6e, 0x8fcb, 0x8ffe, + 0x906d, 0x907a, 0x90c4, 0x90dc, 0x90f1, 0x91bf, 0x92af, 0x92b0, 0x92b1, 0x92b2, 0x92d1, 0x9447, 0x94ca, + 0x95d9, 0x96fc, 0x9975, 0x9b76, 0x9b78, 0x9b7b, 0x9bc6, 0x9bde, 0x9bec, 0x9bf6, 0x9c42, 0x9c53, 0x9c62, + 0x9c68, 0x9c6b, 0x9c77, 0x9cbc, 0x9cbd, 0x9cd0, 0x9d57, 0x9d5a, 0x9dc4, 0x9def, 0x9dfb, 0x9ea9, 0x9eef, + 0x9efd, 0x9f60, 0x9fcb, 0xa077, 0xa0dc, 0xa0df, 0x8fcc, 0x92c8, 0x9644, 0x96ed, + + // Step 2 of https://encoding.spec.whatwg.org/#index-big5-pointer: Use last pointer for U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345 + 0xa2a4, 0xa2a5, 0xa2a7, 0xa2a6, 0xa2cc, 0xa2ce, + ], }, 'cnbig5': 'big5hkscs', diff --git a/generation/gen-dbcs.js b/generation/gen-dbcs.js index 6c877ed..a3d550d 100644 --- a/generation/gen-dbcs.js +++ b/generation/gen-dbcs.js @@ -27,7 +27,7 @@ async.parallel({ } // Calculate difference between big5 and cp950, and write it to a file. - // See http://encoding.spec.whatwg.org/#big5-encoder + // See http://encoding.spec.whatwg.org/#big5 var big5add = {} for (var i = 0x8100; i < 0x10000; i++) { // Lead byte is 0x81 .. 0xFE var trail = i & 0xFF; @@ -41,7 +41,35 @@ async.parallel({ big5add[i] = big5Char; } - // Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-encoder) + // Calculate HKSCS codes that are duplicates of big5 codes and need to be skipped when encoding. + console.log("Duplicate HKSCS codes that need to be skipped when encoded (see encodeSkipVals in big5hkscs): ") + var big5codes = {}; + for (var i = 0xA100; i < 0x10000; i++) { + var uCharCode = (big5add[i] !== undefined) ? big5add[i] : data.cp950[i]; + if (uCharCode !== undefined) { + big5codes[uCharCode] = true; + } + } + for (var i = 0x8100; i < 0xA100; i++) { + var uCharCode = (big5add[i] !== undefined) ? 
big5add[i] : data.cp950[i]; + if (uCharCode !== undefined && big5codes[uCharCode]) { + console.log("0x"+i.toString(16)); + } + } + + if (big5Char !== undefined) { + if (lead < 0xA1) { + if (d[big5Char] !== undefined) { + console.log("duplicate in first: "+ pointer + " char " + big5Char); + } + d[big5Char] = i; + } else if (d[big5Char] !== undefined) { + console.log("dup 0x"+d[big5Char].toString(16) + " -> " + i.toString(16)) + } + + } + + // Add char sequences that are not in the index file (as given in http://encoding.spec.whatwg.org/#big5-decoder) function toIdx(pointer) { var trail = pointer % 157; var lead = Math.floor(pointer / 157) + 0x81; return (lead << 8) + (trail + (trail < 0x3F ? 0x40 : 0x62))} big5add[toIdx(1133)] = [0x00CA, 0x0304]; big5add[toIdx(1135)] = [0x00CA, 0x030C]; diff --git a/test/big5-test.js b/test/big5-test.js index 839a3f6..1ca9dd9 100644 --- a/test/big5-test.js +++ b/test/big5-test.js @@ -54,4 +54,8 @@ describe("Big5 tests", function() { it("Big5 correctly encodes 十", function() { assert.strictEqual(iconv.encode("十", "big5").toString('hex'), "a451"); }); + + it("Big5 correctly encodes 起 (issue #264)", function() { + assert.strictEqual(iconv.encode("起", "big5").toString('hex'), "b05f"); + }); });