From abd43ea30519163f77b4fa5d464ae8212f57c486 Mon Sep 17 00:00:00 2001
From: Khafra <maitken033380023@gmail.com>
Date: Sat, 7 Sep 2024 04:22:03 -0400
Subject: [PATCH] esm: use Undici/`fetch` `data:` URL parser

Fixes: https://github.com/nodejs/node/issues/53775
PR-URL: https://github.com/nodejs/node/pull/54748
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Antoine du Hamel <duhamelantoine1995@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
---
 lib/internal/data_url.js             | 352 +++++++++++++++++++++++++++
 lib/internal/modules/esm/load.js     |  21 +-
 test/es-module/test-esm-data-urls.js |  11 +-
 test/parallel/test-data-url.js       |  30 +++
 4 files changed, 397 insertions(+), 17 deletions(-)
 create mode 100644 lib/internal/data_url.js
 create mode 100644 test/parallel/test-data-url.js

diff --git a/lib/internal/data_url.js b/lib/internal/data_url.js
new file mode 100644
index 00000000000000..772498449319dc
--- /dev/null
+++ b/lib/internal/data_url.js
@@ -0,0 +1,352 @@
+'use strict';
+
+const {
+  RegExpPrototypeExec,
+  RegExpPrototypeSymbolReplace,
+  StringFromCharCodeApply,
+  StringPrototypeCharCodeAt,
+  StringPrototypeIndexOf,
+  StringPrototypeSlice,
+  TypedArrayPrototypeSubarray,
+  Uint8Array,
+} = primordials;
+
+const assert = require('internal/assert');
+const { Buffer } = require('buffer');
+const { MIMEType } = require('internal/mime');
+
+let encoder;
+function lazyEncoder() {
+  if (encoder === undefined) {
+    const { TextEncoder } = require('internal/encoding');
+    encoder = new TextEncoder();
+  }
+
+  return encoder;
+}
+
+const ASCII_WHITESPACE_REPLACE_REGEX = /[\u0009\u000A\u000C\u000D\u0020]/g // eslint-disable-line
+
+// https://fetch.spec.whatwg.org/#data-url-processor
+/** @param {URL} dataURL */
+function dataURLProcessor(dataURL) {
+  // 1. Assert: dataURL's scheme is "data".
+  assert(dataURL.protocol === 'data:');
+
+  // 2. Let input be the result of running the URL
+  // serializer on dataURL with exclude fragment
+  // set to true.
+  let input = URLSerializer(dataURL, true);
+
+  // 3. Remove the leading "data:" string from input.
+  input = StringPrototypeSlice(input, 5);
+
+  // 4. Let position point at the start of input.
+  const position = { position: 0 };
+
+  // 5. Let mimeType be the result of collecting a
+  // sequence of code points that are not equal
+  // to U+002C (,), given position.
+  let mimeType = collectASequenceOfCodePointsFast(
+    ',',
+    input,
+    position,
+  );
+
+  // 6. Strip leading and trailing ASCII whitespace
+  // from mimeType.
+  // Undici implementation note: we need to store the
+  // length because if the mimetype has spaces removed,
+  // the wrong amount will be sliced from the input in
+  // step #9
+  const mimeTypeLength = mimeType.length;
+  mimeType = removeASCIIWhitespace(mimeType, true, true);
+
+  // 7. If position is past the end of input, then
+  // return failure
+  if (position.position >= input.length) {
+    return 'failure';
+  }
+
+  // 8. Advance position by 1.
+  position.position++;
+
+  // 9. Let encodedBody be the remainder of input.
+  const encodedBody = StringPrototypeSlice(input, mimeTypeLength + 1);
+
+  // 10. Let body be the percent-decoding of encodedBody.
+  let body = stringPercentDecode(encodedBody);
+
+  // 11. If mimeType ends with U+003B (;), followed by
+  // zero or more U+0020 SPACE, followed by an ASCII
+  // case-insensitive match for "base64", then:
+  if (RegExpPrototypeExec(/;(\u0020){0,}base64$/i, mimeType) !== null) {
+    // 1. Let stringBody be the isomorphic decode of body.
+    const stringBody = isomorphicDecode(body);
+
+    // 2. Set body to the forgiving-base64 decode of
+    // stringBody.
+    body = forgivingBase64(stringBody);
+
+    // 3. If body is failure, then return failure.
+    if (body === 'failure') {
+      return 'failure';
+    }
+
+    // 4. Remove the last 6 code points from mimeType.
+    mimeType = StringPrototypeSlice(mimeType, 0, -6);
+
+    // 5. Remove trailing U+0020 SPACE code points from mimeType,
+    // if any.
+    mimeType = RegExpPrototypeSymbolReplace(/(\u0020)+$/, mimeType, '');
+
+    // 6. Remove the last U+003B (;) code point from mimeType.
+    mimeType = StringPrototypeSlice(mimeType, 0, -1);
+  }
+
+  // 12. If mimeType starts with U+003B (;), then prepend
+  // "text/plain" to mimeType.
+  if (mimeType[0] === ';') {
+    mimeType = 'text/plain' + mimeType;
+  }
+
+  // 13. Let mimeTypeRecord be the result of parsing
+  // mimeType.
+  // 14. If mimeTypeRecord is failure, then set
+  // mimeTypeRecord to text/plain;charset=US-ASCII.
+  let mimeTypeRecord;
+
+  try {
+    mimeTypeRecord = new MIMEType(mimeType);
+  } catch {
+    mimeTypeRecord = new MIMEType('text/plain;charset=US-ASCII');
+  }
+
+  // 15. Return a new data: URL struct whose MIME
+  // type is mimeTypeRecord and body is body.
+  // https://fetch.spec.whatwg.org/#data-url-struct
+  return { mimeType: mimeTypeRecord, body };
+}
+
+// https://url.spec.whatwg.org/#concept-url-serializer
+/**
+ * @param {URL} url
+ * @param {boolean} excludeFragment
+ */
+function URLSerializer(url, excludeFragment = false) {
+  const { href } = url;
+
+  if (!excludeFragment) {
+    return href;
+  }
+
+  const hashLength = url.hash.length;
+  const serialized = hashLength === 0 ? href : StringPrototypeSlice(href, 0, href.length - hashLength);
+
+  if (!hashLength && href[href.length - 1] === '#') {
+    return StringPrototypeSlice(serialized, 0, -1);
+  }
+
+  return serialized;
+}
+
+/**
+ * A faster collectASequenceOfCodePoints that only works when comparing a single character.
+ * @param {string} char
+ * @param {string} input
+ * @param {{ position: number }} position
+ */
+function collectASequenceOfCodePointsFast(char, input, position) {
+  const idx = StringPrototypeIndexOf(input, char, position.position);
+  const start = position.position;
+
+  if (idx === -1) {
+    position.position = input.length;
+    return StringPrototypeSlice(input, start);
+  }
+
+  position.position = idx;
+  return StringPrototypeSlice(input, start, position.position);
+}
+
+// https://url.spec.whatwg.org/#string-percent-decode
+/** @param {string} input */
+function stringPercentDecode(input) {
+  // 1. Let bytes be the UTF-8 encoding of input.
+  const bytes = lazyEncoder().encode(input);
+
+  // 2. Return the percent-decoding of bytes.
+  return percentDecode(bytes);
+}
+
+/**
+ * @param {number} byte
+ */
+function isHexCharByte(byte) {
+  // 0-9 A-F a-f
+  return (byte >= 0x30 && byte <= 0x39) || (byte >= 0x41 && byte <= 0x46) || (byte >= 0x61 && byte <= 0x66);
+}
+
+/**
+ * @param {number} byte
+ */
+function hexByteToNumber(byte) {
+  return (
+    // 0-9
+    byte >= 0x30 && byte <= 0x39 ?
+      (byte - 48) :
+    // Convert to uppercase
+    // ((byte & 0xDF) - 65) + 10
+      ((byte & 0xDF) - 55)
+  );
+}
+
+// https://url.spec.whatwg.org/#percent-decode
+/** @param {Uint8Array} input */
+function percentDecode(input) {
+  const length = input.length;
+  // 1. Let output be an empty byte sequence.
+  /** @type {Uint8Array} */
+  const output = new Uint8Array(length);
+  let j = 0;
+  // 2. For each byte byte in input:
+  for (let i = 0; i < length; ++i) {
+    const byte = input[i];
+
+    // 1. If byte is not 0x25 (%), then append byte to output.
+    if (byte !== 0x25) {
+      output[j++] = byte;
+
+    // 2. Otherwise, if byte is 0x25 (%) and the next two bytes
+    // after byte in input are not in the ranges
+    // 0x30 (0) to 0x39 (9), 0x41 (A) to 0x46 (F),
+    // and 0x61 (a) to 0x66 (f), all inclusive, append byte
+    // to output.
+    } else if (
+      byte === 0x25 &&
+      !(isHexCharByte(input[i + 1]) && isHexCharByte(input[i + 2]))
+    ) {
+      output[j++] = 0x25;
+
+    // 3. Otherwise:
+    } else {
+      // 1. Let bytePoint be the two bytes after byte in input,
+      // decoded, and then interpreted as hexadecimal number.
+      // 2. Append a byte whose value is bytePoint to output.
+      output[j++] = (hexByteToNumber(input[i + 1]) << 4) | hexByteToNumber(input[i + 2]);
+
+      // 3. Skip the next two bytes in input.
+      i += 2;
+    }
+  }
+
+  // 3. Return output.
+  return length === j ? output : TypedArrayPrototypeSubarray(output, 0, j);
+}
+
+// https://infra.spec.whatwg.org/#forgiving-base64-decode
+/** @param {string} data */
+function forgivingBase64(data) {
+  // 1. Remove all ASCII whitespace from data.
+  data = RegExpPrototypeSymbolReplace(ASCII_WHITESPACE_REPLACE_REGEX, data, '');
+
+  let dataLength = data.length;
+  // 2. If data's code point length divides by 4 leaving
+  // no remainder, then:
+  if (dataLength % 4 === 0) {
+    // 1. If data ends with one or two U+003D (=) code points,
+    // then remove them from data.
+    if (data[dataLength - 1] === '=') {
+      --dataLength;
+      if (data[dataLength - 1] === '=') {
+        --dataLength;
+      }
+    }
+  }
+
+  // 3. If data's code point length divides by 4 leaving
+  // a remainder of 1, then return failure.
+  if (dataLength % 4 === 1) {
+    return 'failure';
+  }
+
+  // 4. If data contains a code point that is not one of
+  //  U+002B (+)
+  //  U+002F (/)
+  //  ASCII alphanumeric
+  // then return failure.
+  if (RegExpPrototypeExec(/[^+/0-9A-Za-z]/, data.length === dataLength ? data : StringPrototypeSlice(data, 0, dataLength)) !== null) {
+    return 'failure';
+  }
+
+  const buffer = Buffer.from(data, 'base64');
+  return new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
+}
+
+/**
+ * @see https://infra.spec.whatwg.org/#ascii-whitespace
+ * @param {number} char
+ */
+function isASCIIWhitespace(char) {
+  // "\r\n\t\f "
+  return char === 0x00d || char === 0x00a || char === 0x009 || char === 0x00c || char === 0x020;
+}
+
+/**
+ * @see https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace
+ * @param {string} str
+ * @param {boolean} [leading=true]
+ * @param {boolean} [trailing=true]
+ */
+function removeASCIIWhitespace(str, leading = true, trailing = true) {
+  return removeChars(str, leading, trailing, isASCIIWhitespace);
+}
+
+/**
+ * @param {string} str
+ * @param {boolean} leading
+ * @param {boolean} trailing
+ * @param {(charCode: number) => boolean} predicate
+ */
+function removeChars(str, leading, trailing, predicate) {
+  let lead = 0;
+  let trail = str.length - 1;
+
+  if (leading) {
+    while (lead < str.length && predicate(StringPrototypeCharCodeAt(str, lead))) lead++;
+  }
+
+  if (trailing) {
+    while (trail > 0 && predicate(StringPrototypeCharCodeAt(str, trail))) trail--;
+  }
+
+  return lead === 0 && trail === str.length - 1 ? str : StringPrototypeSlice(str, lead, trail + 1);
+}
+
+/**
+ * @see https://infra.spec.whatwg.org/#isomorphic-decode
+ * @param {Uint8Array} input
+ * @returns {string}
+ */
+function isomorphicDecode(input) {
+  // 1. To isomorphic decode a byte sequence input, return a string whose code point
+  //    length is equal to input's length and whose code points have the same values
+  //    as the values of input's bytes, in the same order.
+  const length = input.length;
+  if ((2 << 15) - 1 > length) {
+    return StringFromCharCodeApply(input);
+  }
+  let result = ''; let i = 0;
+  let addition = (2 << 15) - 1;
+  while (i < length) {
+    if (i + addition > length) {
+      addition = length - i;
+    }
+    result += StringFromCharCodeApply(TypedArrayPrototypeSubarray(input, i, i += addition));
+  }
+  return result;
+}
+
+module.exports = {
+  dataURLProcessor,
+};
diff --git a/lib/internal/modules/esm/load.js b/lib/internal/modules/esm/load.js
index 7b77af35a1dfeb..5df0355c701da6 100644
--- a/lib/internal/modules/esm/load.js
+++ b/lib/internal/modules/esm/load.js
@@ -3,7 +3,6 @@
 const {
   ArrayPrototypePush,
   RegExpPrototypeExec,
-  decodeURIComponent,
 } = primordials;
 const { kEmptyObject } = require('internal/util');
 
@@ -30,7 +29,9 @@ const {
   ERR_UNSUPPORTED_ESM_URL_SCHEME,
 } = require('internal/errors').codes;
 
-const DATA_URL_PATTERN = /^[^/]+\/[^,;]+(?:[^,]*?)(;base64)?,([\s\S]*)$/;
+const {
+  dataURLProcessor,
+} = require('internal/data_url');
 
 /**
  * @param {URL} url URL to the module
@@ -45,12 +46,11 @@ async function getSource(url, context) {
     const { readFile: readFileAsync } = require('internal/fs/promises').exports;
     source = await readFileAsync(url);
   } else if (protocol === 'data:') {
-    const match = RegExpPrototypeExec(DATA_URL_PATTERN, url.pathname);
-    if (!match) {
-      throw new ERR_INVALID_URL(responseURL);
+    const result = dataURLProcessor(url);
+    if (result === 'failure') {
+      throw new ERR_INVALID_URL(responseURL, null);
     }
-    const { 1: base64, 2: body } = match;
-    source = BufferFrom(decodeURIComponent(body), base64 ? 'base64' : 'utf8');
+    source = BufferFrom(result.body);
   } else if (experimentalNetworkImports && (
     protocol === 'https:' ||
     protocol === 'http:'
@@ -84,12 +84,11 @@ function getSourceSync(url, context) {
   if (protocol === 'file:') {
     source = readFileSync(url);
   } else if (protocol === 'data:') {
-    const match = RegExpPrototypeExec(DATA_URL_PATTERN, url.pathname);
-    if (!match) {
+    const result = dataURLProcessor(url);
+    if (result === 'failure') {
       throw new ERR_INVALID_URL(responseURL);
     }
-    const { 1: base64, 2: body } = match;
-    source = BufferFrom(decodeURIComponent(body), base64 ? 'base64' : 'utf8');
+    source = BufferFrom(result.body);
   } else {
     const supportedSchemes = ['file', 'data'];
     throw new ERR_UNSUPPORTED_ESM_URL_SCHEME(url, supportedSchemes);
diff --git a/test/es-module/test-esm-data-urls.js b/test/es-module/test-esm-data-urls.js
index 0817cf179df340..21f4e94930b1cc 100644
--- a/test/es-module/test-esm-data-urls.js
+++ b/test/es-module/test-esm-data-urls.js
@@ -96,12 +96,7 @@ function createBase64URL(mime, body) {
   {
     const body = 'null';
     const plainESMURL = createURL('invalid', body);
-    try {
-      await import(plainESMURL);
-      common.mustNotCall()();
-    } catch (e) {
-      assert.strictEqual(e.code, 'ERR_INVALID_URL');
-    }
+    await assert.rejects(import(plainESMURL), { code: 'ERR_UNKNOWN_MODULE_FORMAT' });
   }
   {
     const plainESMURL = 'data:text/javascript,export%20default%202';
@@ -112,4 +107,8 @@ function createBase64URL(mime, body) {
     const plainESMURL = `data:text/javascript,${encodeURIComponent(`import ${JSON.stringify(fixtures.fileURL('es-module-url', 'empty.js'))}`)}`;
     await import(plainESMURL);
   }
+  {
+    const plainESMURL = 'data:text/javascript,var x = "hello world?"';
+    await import(plainESMURL);
+  }
 })().then(common.mustCall());
diff --git a/test/parallel/test-data-url.js b/test/parallel/test-data-url.js
new file mode 100644
index 00000000000000..2615ff1b1b85a2
--- /dev/null
+++ b/test/parallel/test-data-url.js
@@ -0,0 +1,30 @@
+'use strict';
+// Flags: --expose-internals
+
+require('../common');
+const fixtures = require('../common/fixtures');
+const assert = require('node:assert');
+const { test } = require('node:test');
+const { dataURLProcessor } = require('internal/data_url');
+
+// https://github.com/web-platform-tests/wpt/blob/7c79d998ff42e52de90290cb847d1b515b3b58f7/fetch/data-urls/processing.any.js
+test('parsing data URLs', async () => {
+  const tests = require(fixtures.path('wpt/fetch/data-urls/resources/data-urls.json'));
+
+  for (let i = 0; i < tests.length; i++) {
+    const input = tests[i][0];
+    const expectedMimeType = tests[i][1];
+    const expectedBody = expectedMimeType !== null ? new Uint8Array(tests[i][2]) : null;
+
+    if (!URL.canParse(input)) {
+      assert.strictEqual(expectedMimeType, null);
+    } else if (expectedMimeType === null) {
+      assert.strictEqual(dataURLProcessor(URL.parse(input)), 'failure');
+    } else {
+      const { mimeType, body } = dataURLProcessor(new URL(input));
+
+      assert.deepStrictEqual(expectedBody, body);
+      assert.deepStrictEqual(expectedMimeType, mimeType.toString());
+    }
+  }
+});