Fixed handling of Unicode characters in the lexer

The lexer was not correctly handling surrogate pairs, i.e. code points above U+FFFF that UTF-16 encodes as two `char` values.

References:
- https://tc39.es/ecma262/#prod-IdentifierName
- https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF
mozilla · Apr 27, 2024 · b842510 · b842510
1 parent a3dfe64
commit b842510
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 48 deletions.
diff --git a/src/org/mozilla/javascript/TokenStream.java b/src/org/mozilla/javascript/TokenStream.java
@@ -685,7 +685,7 @@ final int getToken() throws IOException {
                     c = '\\';
                 }
             } else {
-                identifierStart = Character.isJavaIdentifierStart((char) c);
+                identifierStart = Character.isUnicodeIdentifierStart(c) || c == '$' || c == '_';
                 if (identifierStart) {
                     stringBufferTop = 0;
                     addToString(c);
@@ -751,7 +751,7 @@ final int getToken() throws IOException {
                         } else {
                             if (c == EOF_CHAR
                                     || c == BYTE_ORDER_MARK
-                                    || !Character.isJavaIdentifierPart((char) c)) {
+                                    || !(Character.isUnicodeIdentifierPart(c) || c == '$')) {
                                 break;
                             }
                             addToString(c);
@@ -2058,13 +2058,19 @@ private String getStringFromBuffer() {
 
     private void addToString(int c) {
         int N = stringBufferTop;
-        if (N == stringBuffer.length) {
+        int codePointLen = Character.charCount(c);
+        if (N + codePointLen >= stringBuffer.length) {
             char[] tmp = new char[stringBuffer.length * 2];
             System.arraycopy(stringBuffer, 0, tmp, 0, N);
             stringBuffer = tmp;
         }
-        stringBuffer[N] = (char) c;
-        stringBufferTop = N + 1;
+        if (codePointLen == 1) {
+            stringBuffer[N] = (char) c;
+        } else {
+            stringBuffer[N] = Character.highSurrogate(c);
+            stringBuffer[N + 1] = Character.lowSurrogate(c);
+        }
+        stringBufferTop = N + codePointLen;
     }
 
     private boolean canUngetChar() {
@@ -2116,7 +2122,8 @@ private int getChar(boolean skipFormattingChars, boolean ignoreLineEnd) throws I
                     return EOF_CHAR;
                 }
                 cursor++;
-                c = sourceString.charAt(sourceCursor++);
+                c = sourceString.codePointAt(sourceCursor);
+                sourceCursor += Character.charCount(c);
             } else {
                 if (sourceCursor == sourceEnd) {
                     if (!fillSourceBuffer()) {

diff --git a/testsrc/org/mozilla/javascript/tests/ParserTest.java b/testsrc/org/mozilla/javascript/tests/ParserTest.java
@@ -1200,6 +1200,24 @@ public void parseUnicodeFormatName() {
         assertEquals("AB", first.getString());
     }
 
+    @Test
+    public void testParseUnicodeMultibyteCharacter() {
+        AstRoot root = parse("\uD842\uDFB7");
+        AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression();
+        assertEquals("𠮷", first.getString());
+    }
+
+    @Test
+    public void testParseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() {
+        // On the JDK 11 I'm using, Character.isUnicodeIdentifierPart(U+9FEB) returns true
+        // but Character.isJavaIdentifierPart(U+9FEB) returns false. On a JDK 17 results
+        // seem to vary, but I think it's enough to verify that TokenStream uses
+        // the unicode methods and not the java methods.
+        AstRoot root = parse("a\u9FEB");
+        AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression();
+        assertEquals("a鿫", first.getString());
+    }
+
     @Test
     public void parseUnicodeReservedKeywords1() {
         AstRoot root = parse("\\u0069\\u0066");

diff --git a/testsrc/test262.properties b/testsrc/test262.properties
@@ -5027,48 +5027,7 @@ language/global-code 29/41 (70.73%)
 
 language/identifier-resolution 0/13 (0.0%)
 
-language/identifiers 45/188 (23.94%)
-    other_id_continue.js
-    other_id_continue-escaped.js
-    other_id_start.js
-    other_id_start-escaped.js
-    part-unicode-10.0.0.js
-    part-unicode-10.0.0-escaped.js
-    part-unicode-11.0.0.js
-    part-unicode-11.0.0-escaped.js
-    part-unicode-12.0.0.js
-    part-unicode-12.0.0-escaped.js
-    part-unicode-13.0.0.js
-    part-unicode-13.0.0-escaped.js
-    part-unicode-5.2.0.js
-    part-unicode-5.2.0-escaped.js
-    part-unicode-6.0.0.js
-    part-unicode-6.1.0.js
-    part-unicode-7.0.0.js
-    part-unicode-7.0.0-escaped.js
-    part-unicode-8.0.0.js
-    part-unicode-8.0.0-escaped.js
-    part-unicode-9.0.0.js
-    part-unicode-9.0.0-escaped.js
-    start-unicode-10.0.0.js
-    start-unicode-10.0.0-escaped.js
-    start-unicode-11.0.0.js
-    start-unicode-11.0.0-escaped.js
-    start-unicode-12.0.0.js
-    start-unicode-12.0.0-escaped.js
-    start-unicode-13.0.0.js
-    start-unicode-13.0.0-escaped.js
-    start-unicode-5.2.0.js
-    start-unicode-5.2.0-escaped.js
-    start-unicode-6.0.0.js
-    start-unicode-6.1.0.js
-    start-unicode-6.1.0-escaped.js
-    start-unicode-7.0.0.js
-    start-unicode-7.0.0-escaped.js
-    start-unicode-8.0.0.js
-    start-unicode-8.0.0-escaped.js
-    start-unicode-9.0.0.js
-    start-unicode-9.0.0-escaped.js
+language/identifiers 4/188 (2.13%)
     vertical-tilde-continue.js
     vertical-tilde-continue-escaped.js
     vertical-tilde-start.js