From bd3e8dbf40203527d37e5bc87e655eae6a207ef3 Mon Sep 17 00:00:00 2001 From: rnetuka Date: Thu, 12 Sep 2024 13:12:11 +0200 Subject: [PATCH] Write 4-byte characters (surrogate pairs) instead of escapes --- .../fasterxml/jackson/core/JsonGenerator.java | 13 ++++++- .../jackson/core/json/UTF8JsonGenerator.java | 34 +++++++++++++++++-- .../json}/Surrogate223Test.java | 8 ++--- 3 files changed, 48 insertions(+), 7 deletions(-) rename src/test/java/com/fasterxml/jackson/{failing => core/json}/Surrogate223Test.java (91%) diff --git a/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java index f8ee301e2b..5cf22d17b7 100644 --- a/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java +++ b/src/main/java/com/fasterxml/jackson/core/JsonGenerator.java @@ -275,7 +275,18 @@ public enum Feature { * * @since 2.17 */ - ESCAPE_FORWARD_SLASHES(false); + ESCAPE_FORWARD_SLASHES(false), + + /** + * Feature that specifies how 4-byte characters should be handled in {@link JsonGenerator}. If enabled, + * 4-byte characters made by surrogate pairs are combined and flushed as a single character encoded in UTF-8. + * If disabled, each pair is written as UTF-16 escape. + *

+ * Feature is disabled by default + * + * @since 2.18 + */ + COMBINE_UNICODE_SURROGATES(false); private final boolean _defaultState; private final int _mask; diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java index de47f9b48b..5e9aeb8773 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java @@ -3,6 +3,7 @@ import java.io.*; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; import com.fasterxml.jackson.core.*; import com.fasterxml.jackson.core.io.CharTypes; @@ -659,6 +660,10 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException _outputBuffer[_outputTail++] = _quoteChar; } + private boolean isSurrogatePair(char ch) { + return Character.isHighSurrogate(ch); /* true only for a lead surrogate (U+D800..U+DBFF), i.e. a char that can start a pair */ + } + /* /********************************************************** /* Output method implementations, unprocessed ("raw") @@ -1489,6 +1494,8 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int final byte[] outputBuffer = _outputBuffer; final int[] escCodes = _outputEscapes; + boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features); + while (offset < end) { int ch = cbuf[offset++]; if (ch <= 0x7F) { @@ -1510,7 +1517,14 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6)); outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f)); } else { - outputPtr = _outputMultiByteChar(ch, outputPtr); + // multibyte character; combine only a well-formed high+low surrogate pair + if (combineSurrogates && isSurrogatePair((char) ch) && offset < end && Character.isLowSurrogate(cbuf[offset])) { + char highSurrogate = (char) ch; + char lowSurrogate = cbuf[offset++]; + outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr); + } else { + outputPtr = _outputMultiByteChar(ch, outputPtr); + } } }
_outputTail = outputPtr; @@ -1527,6 +1541,8 @@ private final void _writeStringSegment2(final String text, int offset, final int final byte[] outputBuffer = _outputBuffer; final int[] escCodes = _outputEscapes; + boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features); + while (offset < end) { int ch = text.charAt(offset++); if (ch <= 0x7F) { @@ -1548,7 +1564,14 @@ private final void _writeStringSegment2(final String text, int offset, final int outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6)); outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f)); } else { - outputPtr = _outputMultiByteChar(ch, outputPtr); + // multibyte character; combine only when a low surrogate actually follows + if (combineSurrogates && isSurrogatePair((char) ch) && offset < end && Character.isLowSurrogate(text.charAt(offset))) { + char highSurrogate = (char) ch; + char lowSurrogate = text.charAt(offset++); + outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr); + } else { + outputPtr = _outputMultiByteChar(ch, outputPtr); + } } } _outputTail = outputPtr; @@ -2133,6 +2156,13 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f)); } + private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) { + String s = String.valueOf(highSurrogate) + lowSurrogate; + byte[] bytes = s.getBytes(StandardCharsets.UTF_8); + System.arraycopy(bytes, 0, _outputBuffer, outputPtr, bytes.length); + return outputPtr + bytes.length; + } + /** * * @param ch diff --git a/src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java b/src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java similarity index 91% rename from src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java rename to src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java index c1766cf987..30675c719e 100644 --- a/src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java +++ b/src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java @@ -1,4
+1,4 @@ -package com.fasterxml.jackson.failing; +package com.fasterxml.jackson.core.json; import java.io.ByteArrayOutputStream; import java.io.StringWriter; @@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test; +import static com.fasterxml.jackson.core.JsonGenerator.Feature; import static org.junit.jupiter.api.Assertions.assertEquals; class Surrogate223Test extends JUnit5TestBase @@ -27,7 +28,7 @@ void surrogatesByteBacked() throws Exception // assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES)); out = new ByteArrayOutputStream(); - g = JSON_F.createGenerator(out); + g = JSON_F.createGenerator(out).enable(Feature.COMBINE_UNICODE_SURROGATES); g.writeStartArray(); g.writeString(toQuote); g.writeEndArray(); @@ -43,8 +44,7 @@ void surrogatesByteBacked() throws Exception // but may revert back to original behavior out = new ByteArrayOutputStream(); - g = JSON_F.createGenerator(out); -// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES); + g = JSON_F.createGenerator(out).disable(Feature.COMBINE_UNICODE_SURROGATES); g.writeStartArray(); g.writeString(toQuote); g.writeEndArray();