
gh-126024: optimize UTF-8 decoder for short non-ASCII string #126025

Open · wants to merge 15 commits into main
@@ -0,0 +1,2 @@
Optimize decoding of short UTF-8 sequences containing non-ASCII characters
by approximately 15%.
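
For reference, here is a minimal stand-alone sketch of the word-at-a-time (SWAR) scan for the first non-ASCII byte that this patch builds on. It is illustrative only and not part of the patch: it assumes a 64-bit little-endian build with GCC/Clang (__builtin_ctzll), and the name first_nonascii_demo and the sample input are made up for the demo.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Return the index of the first byte with the high bit set (the first
       non-ASCII byte), or len if the buffer is pure ASCII. */
    static size_t
    first_nonascii_demo(const unsigned char *s, size_t len)
    {
        size_t i = 0;
        /* Scan eight bytes at a time; memcpy sidesteps alignment issues. */
        for (; i + sizeof(uint64_t) <= len; i += sizeof(uint64_t)) {
            uint64_t chunk;
            memcpy(&chunk, s + i, sizeof(chunk));
            uint64_t mask = chunk & 0x8080808080808080ULL;
            if (mask) {
                /* The lowest set bit is bit 7 of the first non-ASCII byte,
                   so (ctz - 7) / 8 is its byte offset within the chunk. */
                return i + (__builtin_ctzll(mask) - 7) / 8;
            }
        }
        for (; i < len; i++) {
            if (s[i] & 0x80) {
                return i;
            }
        }
        return len;
    }

    int main(void)
    {
        const unsigned char text[] = "hello, \xc3\xa9clair";  /* U+00E9 at index 7 */
        printf("%zu\n", first_nonascii_demo(text, strlen((const char *)text)));
        return 0;
    }

The find_first_nonascii() helper in the diff below does the same thing, with additional alignment handling and a manual tail load so it also works on big-endian builds.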
260 changes: 246 additions & 14 deletions Objects/unicodeobject.c
@@ -1305,6 +1305,46 @@ _PyUnicode_Dump(PyObject *op)
}
#endif

// Simplified version of PyUnicode_New() that only creates ASCII strings.
// This function does not check for size == 0; the caller must handle the
// empty string itself.
static PyObject *
ascii_new(Py_ssize_t size)
{
PyObject *obj;
void *data;
Py_ssize_t struct_size = sizeof(PyASCIIObject);

if (size > ((PY_SSIZE_T_MAX - struct_size) - 1)) {
return PyErr_NoMemory();
}

/* Duplicated allocation code from _PyObject_New() instead of a call to
* PyObject_New() so we are able to allocate space for the object and
* its data buffer.
*/
obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1));
if (obj == NULL) {
return PyErr_NoMemory();
}
_PyObject_Init(obj, &PyUnicode_Type);

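// For compact ASCII strings, the character buffer starts immediately after
// the PyASCIIObject header.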
data = ((PyASCIIObject*)obj) + 1;

_PyUnicode_LENGTH(obj) = size;
_PyUnicode_HASH(obj) = -1;
_PyUnicode_STATE(obj).interned = 0;
_PyUnicode_STATE(obj).kind = PyUnicode_1BYTE_KIND;
_PyUnicode_STATE(obj).compact = 1;
_PyUnicode_STATE(obj).ascii = 1;
_PyUnicode_STATE(obj).statically_allocated = 0;
((char*)data)[size] = 0;

#ifdef Py_DEBUG
unicode_fill_invalid((PyObject*)obj, 0);
#endif
assert(_PyUnicode_CheckConsistency(obj, 0));
return obj;
}

PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
@@ -2208,13 +2248,16 @@ _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
{
const unsigned char *s = (const unsigned char *)buffer;
PyObject *unicode;
if (size == 0) {
return unicode_get_empty();
}
if (size == 1) {
#ifdef Py_DEBUG
assert((unsigned char)s[0] < 128);
#endif
return get_latin1_char(s[0]);
}
unicode = PyUnicode_New(size, 127);
unicode = ascii_new(size);
if (!unicode)
return NULL;
memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
@@ -4978,12 +5021,17 @@ PyUnicode_DecodeUTF8(const char *s,
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
// Used to count code points in UTF-8 strings.
# define VECTOR_0101 0x0101010101010101ULL
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101 0x01010101U
# define VECTOR_00FF 0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
@@ -5043,6 +5091,149 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start;
}

#if (defined(__clang__) || defined(__GNUC__))
#define HAS_CTZ 1
static inline unsigned int
ctz(size_t v)
{
return __builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAS_CTZ 1
static inline unsigned int
ctz(size_t v)
{
unsigned long pos;
#if SIZEOF_SIZE_T == 4
_BitScanForward(&pos, v);
#else
_BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
return pos;
}
#endif

static Py_ssize_t
find_first_nonascii(const unsigned char *start, const unsigned char *end)
{
const unsigned char *p = start;

if (end - start > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
while (!_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
if ((unsigned char)*p & 0x80) {
return p - start;
}
p++;
}
const unsigned char *e = end - SIZEOF_SIZE_T;
while (p <= e) {
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
if (value) {
#if PY_LITTLE_ENDIAN && HAS_CTZ
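// The lowest set bit of value is bit 7 of the first non-ASCII byte
// (little endian), so convert the bit index into a byte index.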
return p - start + (ctz(value) - 7) / 8;
#else
// Big-endian platforms and uncommon compilers are hard to test;
// fall back to the per-byte check.
break;
#endif
}
p += SIZEOF_SIZE_T;
}
}
#if HAS_CTZ
// This part looks a bit tricky, but decoding short ASCII strings is very
// important. Since the bytes are copied from p into the size_t manually,
// this also works on big-endian machines.
while (p < end) {
size_t u = (size_t)(p[0]);
switch (end - p) {
default:
#if SIZEOF_SIZE_T == 8
u |= (size_t)(p[7]) << 56ull;
// fall through
case 7:
u |= (size_t)(p[6]) << 48ull;
// fall through
case 6:
u |= (size_t)(p[5]) << 40ull;
// fall through
case 5:
u |= (size_t)(p[4]) << 32ull;
// fall through
case 4:
#endif
u |= (size_t)(p[3]) << 24;
// fall through
case 3:
u |= (size_t)(p[2]) << 16;
// fall through
case 2:
u |= (size_t)(p[1]) << 8;
break;
case 1:
break;
}
if (u & ASCII_CHAR_MASK) {
return p - start + (ctz(u & ASCII_CHAR_MASK) - 7) / 8;
}
p += SIZEOF_SIZE_T;
}
return end - start;
#else
while (p < end) {
if ((unsigned char)*p & 0x80) {
break;
}
p++;
}
return p - start;
#endif
}

static inline int scalar_utf8_start_char(unsigned int ch)
{
// 0xxxxxxx or 11xxxxxx are first byte.
return (~ch >> 7 | ch >> 6) & 1;
}

static inline size_t vector_utf8_start_chars(size_t v)
{
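// Each byte lane of the result is 0x01 if the corresponding input byte is
// a UTF-8 start byte (ASCII or a lead byte) and 0x00 for a continuation byte.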
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
}

static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
{
Py_ssize_t len = 0;

if (end - s > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
len += scalar_utf8_start_char(*s++);
}

while (s + SIZEOF_SIZE_T <= end) {
const unsigned char *e = end;
if (e - s > SIZEOF_SIZE_T * 255) {
e = s + SIZEOF_SIZE_T * 255;
}
Py_ssize_t vstart = 0;
while (s + SIZEOF_SIZE_T <= e) {
size_t v = *(size_t*)s;
size_t vs = vector_utf8_start_chars(v);
vstart += vs;
s += SIZEOF_SIZE_T;
}
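// Horizontal sum: fold the per-byte counters (each at most 255, hence the
// 255-word chunks above) into a single total; the 0x7ff mask is enough
// because a chunk contributes at most 8 * 255 = 2040.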
vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
vstart += vstart >> 32;
#endif
len += vstart & 0x7ff;
}
}
while (s < end) {
len += scalar_utf8_start_char(*s++);
}
return len;
}

static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
@@ -5187,27 +5378,66 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]);
}

// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
// It is unclear whether this check is necessary, but there is a test
// case that requires size=PY_SSIZE_T_MAX to raise MemoryError.
if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
PyErr_NoMemory();
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {

const char *starts = s;
const char *end = s + size;

Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
if (pos == size) { // fast path: ASCII string.
PyObject *u = ascii_new(size);
if (u == NULL) {
return NULL;
}
// Equivalent to memcpy(PyUnicode_1BYTE_DATA(u), s, size), but bypasses
// the is-compact and is-ascii checks.
memcpy(_Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(u) + 1)), s, size);
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;

int maxchr = 127;
Py_ssize_t maxsize = size;

unsigned char ch = (unsigned char)(s[pos]);
// An error handler other than "strict" may remove or replace the invalid byte.
// consumed != NULL allows 1-3 trailing bytes to remain undecoded.
// 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
// Otherwise, inspect the input and choose maxchr and maxsize up front to
// reduce reallocation and copying.
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
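// The first non-ASCII byte is a valid lead byte here: 0xc2-0xc3 encode
// U+0080..U+00FF, 0xc4-0xef encode code points up to U+FFFF, and
// 0xf0 and above require UCS-4 storage.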
if (ch < 0xc4) { // latin1
maxchr = 0xff;
}
else if (ch < 0xf0) { // ucs2
maxchr = 0xffff;
}
else { // ucs4
maxchr = 0x10ffff;
}
}
PyObject *u = PyUnicode_New(maxsize, maxchr);
if (!u) {
return NULL;
}

// Use _PyUnicodeWriter once the fast path has failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;
if (maxchr <= 255) {
memcpy(_PyUnicode_COMPACT_DATA(u), s, pos);
s += pos;
size -= pos;
writer.pos = pos;
}

if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
@@ -5267,7 +5497,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
const char *errors,
Py_ssize_t *consumed)
{
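// errors == NULL means the default strict handler; resolving it here lets
// unicode_decode_utf8() take the new pre-sizing fast path.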
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
return unicode_decode_utf8(s, size,
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
errors, consumed);
}


@@ -7282,7 +7514,7 @@ PyUnicode_DecodeASCII(const char *s,
}

// Shortcut for simple case
PyObject *u = PyUnicode_New(size, 127);
PyObject *u = ascii_new(size);
if (u == NULL) {
return NULL;
}