Skip to content

Commit

Permalink
Fixed handling of unicode characters in the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
andreabergia authored and rbri committed Apr 27, 2024
1 parent a3dfe64 commit b842510
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 48 deletions.
19 changes: 13 additions & 6 deletions src/org/mozilla/javascript/TokenStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ final int getToken() throws IOException {
c = '\\';
}
} else {
identifierStart = Character.isJavaIdentifierStart((char) c);
identifierStart = Character.isUnicodeIdentifierStart(c) || c == '$' || c == '_';
if (identifierStart) {
stringBufferTop = 0;
addToString(c);
Expand Down Expand Up @@ -751,7 +751,7 @@ final int getToken() throws IOException {
} else {
if (c == EOF_CHAR
|| c == BYTE_ORDER_MARK
|| !Character.isJavaIdentifierPart((char) c)) {
|| !(Character.isUnicodeIdentifierPart(c) || c == '$')) {
break;
}
addToString(c);
Expand Down Expand Up @@ -2058,13 +2058,19 @@ private String getStringFromBuffer() {

// Appends the code point c to stringBuffer and advances stringBufferTop,
// doubling the buffer first when it is too small.
// NOTE(review): this hunk is shown without +/- diff markers, so pre-commit and
// post-commit lines are interleaved and the braces do not balance as displayed.
// The "old"/"new" comments below are inferred from the code and from the
// "13 additions & 6 deletions" summary -- confirm against the real diff.
private void addToString(int c) {
int N = stringBufferTop;
// old (removed): grew only when the buffer was exactly full, one char per call.
if (N == stringBuffer.length) {
// new (added): number of UTF-16 chars needed for c (1, or 2 for a supplementary code point).
int codePointLen = Character.charCount(c);
// new (added): grow before writing; '>=' also triggers when the code point would exactly fit.
if (N + codePointLen >= stringBuffer.length) {
char[] tmp = new char[stringBuffer.length * 2];
System.arraycopy(stringBuffer, 0, tmp, 0, N);
stringBuffer = tmp;
}
// old (removed): the narrowing cast to char kept only the low 16 bits of c.
stringBuffer[N] = (char) c;
stringBufferTop = N + 1;
// new (added): store one char directly, or a high/low surrogate pair.
if (codePointLen == 1) {
stringBuffer[N] = (char) c;
} else {
stringBuffer[N] = Character.highSurrogate(c);
stringBuffer[N + 1] = Character.lowSurrogate(c);
}
// new (added): advance by the number of chars actually written.
stringBufferTop = N + codePointLen;
}

private boolean canUngetChar() {
Expand Down Expand Up @@ -2116,7 +2122,8 @@ private int getChar(boolean skipFormattingChars, boolean ignoreLineEnd) throws I
return EOF_CHAR;
}
cursor++;
c = sourceString.charAt(sourceCursor++);
c = sourceString.codePointAt(sourceCursor);
sourceCursor += Character.charCount(c);
} else {
if (sourceCursor == sourceEnd) {
if (!fillSourceBuffer()) {
Expand Down
18 changes: 18 additions & 0 deletions testsrc/org/mozilla/javascript/tests/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,24 @@ public void parseUnicodeFormatName() {
assertEquals("AB", first.getString());
}

@Test
public void testParseUnicodeMultibyteCharacter() {
    // The source is the surrogate pair \uD842\uDFB7 (code point U+20BB7);
    // the parsed expression must hand back that same two-char string.
    ExpressionStatement statement =
            (ExpressionStatement) parse("\uD842\uDFB7").getFirstChild();
    AstNode expression = statement.getExpression();
    assertEquals("𠮷", expression.getString());
}

@Test
public void testParseUnicodeIdentifierPartWhichIsNotJavaIdentifierPart() {
    // Per the original author's observation: on their JDK 11,
    // Character.isUnicodeIdentifierPart(U+9FEB) is true while
    // Character.isJavaIdentifierPart(U+9FEB) is false; on a JDK 17 the results
    // seemed to vary. The point of the test is only to check that TokenStream
    // relies on the unicode methods rather than the java ones.
    ExpressionStatement statement =
            (ExpressionStatement) parse("a\u9FEB").getFirstChild();
    AstNode expression = statement.getExpression();
    assertEquals("a鿫", expression.getString());
}

@Test
public void parseUnicodeReservedKeywords1() {
AstRoot root = parse("\\u0069\\u0066");
Expand Down
43 changes: 1 addition & 42 deletions testsrc/test262.properties
Original file line number Diff line number Diff line change
Expand Up @@ -5027,48 +5027,7 @@ language/global-code 29/41 (70.73%)

language/identifier-resolution 0/13 (0.0%)

language/identifiers 45/188 (23.94%)
other_id_continue.js
other_id_continue-escaped.js
other_id_start.js
other_id_start-escaped.js
part-unicode-10.0.0.js
part-unicode-10.0.0-escaped.js
part-unicode-11.0.0.js
part-unicode-11.0.0-escaped.js
part-unicode-12.0.0.js
part-unicode-12.0.0-escaped.js
part-unicode-13.0.0.js
part-unicode-13.0.0-escaped.js
part-unicode-5.2.0.js
part-unicode-5.2.0-escaped.js
part-unicode-6.0.0.js
part-unicode-6.1.0.js
part-unicode-7.0.0.js
part-unicode-7.0.0-escaped.js
part-unicode-8.0.0.js
part-unicode-8.0.0-escaped.js
part-unicode-9.0.0.js
part-unicode-9.0.0-escaped.js
start-unicode-10.0.0.js
start-unicode-10.0.0-escaped.js
start-unicode-11.0.0.js
start-unicode-11.0.0-escaped.js
start-unicode-12.0.0.js
start-unicode-12.0.0-escaped.js
start-unicode-13.0.0.js
start-unicode-13.0.0-escaped.js
start-unicode-5.2.0.js
start-unicode-5.2.0-escaped.js
start-unicode-6.0.0.js
start-unicode-6.1.0.js
start-unicode-6.1.0-escaped.js
start-unicode-7.0.0.js
start-unicode-7.0.0-escaped.js
start-unicode-8.0.0.js
start-unicode-8.0.0-escaped.js
start-unicode-9.0.0.js
start-unicode-9.0.0-escaped.js
language/identifiers 4/188 (2.13%)
vertical-tilde-continue.js
vertical-tilde-continue-escaped.js
vertical-tilde-start.js
Expand Down

0 comments on commit b842510

Please sign in to comment.