From b842510405b42af041adb89b36bbba01ca028612 Mon Sep 17 00:00:00 2001 From: "andrea.bergia" Date: Thu, 18 Apr 2024 10:46:39 +0200 Subject: [PATCH] Fixed handling of unicode characters in the lexer The lexer did not handle surrogate pairs correctly. References: - https://tc39.es/ecma262/#prod-IdentifierName - https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF --- src/org/mozilla/javascript/TokenStream.java | 19 +++++--- .../mozilla/javascript/tests/ParserTest.java | 18 ++++++++ testsrc/test262.properties | 43 +------------------ 3 files changed, 32 insertions(+), 48 deletions(-) diff --git a/src/org/mozilla/javascript/TokenStream.java b/src/org/mozilla/javascript/TokenStream.java index e84d1c9a07..029b16a341 100644 --- a/src/org/mozilla/javascript/TokenStream.java +++ b/src/org/mozilla/javascript/TokenStream.java @@ -685,7 +685,7 @@ final int getToken() throws IOException { c = '\\'; } } else { - identifierStart = Character.isJavaIdentifierStart((char) c); + identifierStart = Character.isUnicodeIdentifierStart(c) || c == '$' || c == '_'; if (identifierStart) { stringBufferTop = 0; addToString(c); @@ -751,7 +751,7 @@ final int getToken() throws IOException { } else { if (c == EOF_CHAR || c == BYTE_ORDER_MARK - || !Character.isJavaIdentifierPart((char) c)) { + || !(Character.isUnicodeIdentifierPart(c) || c == '$')) { break; } addToString(c); @@ -2058,13 +2058,19 @@ private String getStringFromBuffer() { private void addToString(int c) { int N = stringBufferTop; - if (N == stringBuffer.length) { + int codePointLen = Character.charCount(c); + if (N + codePointLen >= stringBuffer.length) { char[] tmp = new char[stringBuffer.length * 2]; System.arraycopy(stringBuffer, 0, tmp, 0, N); stringBuffer = tmp; } - stringBuffer[N] = (char) c; - stringBufferTop = N + 1; + if (codePointLen == 1) { + stringBuffer[N] = (char) c; + } else { + stringBuffer[N] = Character.highSurrogate(c); + stringBuffer[N + 1] = Character.lowSurrogate(c); + } + stringBufferTop 
= N + codePointLen; } private boolean canUngetChar() { @@ -2116,7 +2122,8 @@ private int getChar(boolean skipFormattingChars, boolean ignoreLineEnd) throws I return EOF_CHAR; } cursor++; - c = sourceString.charAt(sourceCursor++); + c = sourceString.codePointAt(sourceCursor); + sourceCursor += Character.charCount(c); } else { if (sourceCursor == sourceEnd) { if (!fillSourceBuffer()) { diff --git a/testsrc/org/mozilla/javascript/tests/ParserTest.java b/testsrc/org/mozilla/javascript/tests/ParserTest.java index 2d15b0d16b..59b6b6a276 100644 --- a/testsrc/org/mozilla/javascript/tests/ParserTest.java +++ b/testsrc/org/mozilla/javascript/tests/ParserTest.java @@ -1200,6 +1200,24 @@ public void parseUnicodeFormatName() { assertEquals("AB", first.getString()); } + @Test + public void testParseUnicodeMultibyteCharacter() { + AstRoot root = parse("\uD842\uDFB7"); + AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression(); + assertEquals("𠮷", first.getString()); + } + + @Test + public void testParseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() { + // On the JDK 11 I'm using, Character.isUnicodeIdentifierPart(U+9FEB) returns true + // but Character.isJavaIdentifierPart(U+9FEB) returns false. On a JDK 17 results + // seem to vary, but I think it's enough to verify that TokenStream uses + // the unicode methods and not the java methods. 
+ AstRoot root = parse("a\u9FEB"); + AstNode first = ((ExpressionStatement) root.getFirstChild()).getExpression(); + assertEquals("a鿫", first.getString()); + } + @Test public void parseUnicodeReservedKeywords1() { AstRoot root = parse("\\u0069\\u0066"); diff --git a/testsrc/test262.properties b/testsrc/test262.properties index 35502d398c..c2729b5258 100644 --- a/testsrc/test262.properties +++ b/testsrc/test262.properties @@ -5027,48 +5027,7 @@ language/global-code 29/41 (70.73%) language/identifier-resolution 0/13 (0.0%) -language/identifiers 45/188 (23.94%) - other_id_continue.js - other_id_continue-escaped.js - other_id_start.js - other_id_start-escaped.js - part-unicode-10.0.0.js - part-unicode-10.0.0-escaped.js - part-unicode-11.0.0.js - part-unicode-11.0.0-escaped.js - part-unicode-12.0.0.js - part-unicode-12.0.0-escaped.js - part-unicode-13.0.0.js - part-unicode-13.0.0-escaped.js - part-unicode-5.2.0.js - part-unicode-5.2.0-escaped.js - part-unicode-6.0.0.js - part-unicode-6.1.0.js - part-unicode-7.0.0.js - part-unicode-7.0.0-escaped.js - part-unicode-8.0.0.js - part-unicode-8.0.0-escaped.js - part-unicode-9.0.0.js - part-unicode-9.0.0-escaped.js - start-unicode-10.0.0.js - start-unicode-10.0.0-escaped.js - start-unicode-11.0.0.js - start-unicode-11.0.0-escaped.js - start-unicode-12.0.0.js - start-unicode-12.0.0-escaped.js - start-unicode-13.0.0.js - start-unicode-13.0.0-escaped.js - start-unicode-5.2.0.js - start-unicode-5.2.0-escaped.js - start-unicode-6.0.0.js - start-unicode-6.1.0.js - start-unicode-6.1.0-escaped.js - start-unicode-7.0.0.js - start-unicode-7.0.0-escaped.js - start-unicode-8.0.0.js - start-unicode-8.0.0-escaped.js - start-unicode-9.0.0.js - start-unicode-9.0.0-escaped.js +language/identifiers 4/188 (2.13%) vertical-tilde-continue.js vertical-tilde-continue-escaped.js vertical-tilde-start.js