Skip to content

Commit

Permalink
bpo-36346: Prepare for removing the legacy Unicode C API.
Browse files Browse the repository at this point in the history
Add two compile time options: HAVE_UNICODE_WCHAR_CACHE and
USE_UNICODE_WCHAR_CACHE.

Setting USE_UNICODE_WCHAR_CACHE to 0 makes the interpreter stop using the
wchar_t cache and the legacy Unicode C API.

Setting HAVE_UNICODE_WCHAR_CACHE to 0 removes the wchar_t cache and the
legacy Unicode C API that depends on it.
  • Loading branch information
serhiy-storchaka committed Mar 18, 2019
1 parent f40b4a0 commit 699e616
Show file tree
Hide file tree
Showing 29 changed files with 1,237 additions and 336 deletions.
15 changes: 15 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;

/* --- Internal Unicode Operations ---------------------------------------- */

#define HAVE_UNICODE_WCHAR_CACHE 1
#define USE_UNICODE_WCHAR_CACHE 1

/* Since splitting on whitespace is an important use case, and
whitespace in most situations is solely ASCII whitespace, we
optimize for the common case by using a quick look-up table
Expand Down Expand Up @@ -71,13 +74,15 @@ typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))

#if HAVE_UNICODE_WCHAR_CACHE
/* Py_UNICODE_MATCH(string, offset, substring)

   Evaluates to non-zero when the wstr buffer of `string`, starting at
   `offset`, holds the same wstr_length code units as the wstr buffer of
   `substring`.  The first and the last code unit are compared before
   falling back to memcmp() over the whole length.

   The offset must be valid, and the substring must not be empty.
   Every argument is expanded more than once, so pass expressions that
   have no side effects. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->wstr[(offset)] == (substring)->wstr[0]) && \
     (((string)->wstr + (offset))[(substring)->wstr_length - 1] == \
      (substring)->wstr[(substring)->wstr_length - 1]) && \
     memcmp(&(string)->wstr[(offset)], \
            (substring)->wstr, \
            (substring)->wstr_length * sizeof(Py_UNICODE)) == 0)
#endif /* HAVE_UNICODE_WCHAR_CACHE */

/* --- Unicode Type ------------------------------------------------------- */

Expand Down Expand Up @@ -218,7 +223,9 @@ typedef struct {
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
} state;
#if HAVE_UNICODE_WCHAR_CACHE
wchar_t *wstr; /* wchar_t representation (null-terminated) */
#endif /* HAVE_UNICODE_WCHAR_CACHE */
} PyASCIIObject;

/* Non-ASCII strings allocated through PyUnicode_New use the
Expand All @@ -229,8 +236,10 @@ typedef struct {
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
#if HAVE_UNICODE_WCHAR_CACHE
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
#endif /* HAVE_UNICODE_WCHAR_CACHE */
} PyCompactUnicodeObject;

/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
Expand All @@ -247,6 +256,8 @@ typedef struct {
} PyUnicodeObject;

/* Fast access macros */
#if HAVE_UNICODE_WCHAR_CACHE

#define PyUnicode_WSTR_LENGTH(op) \
(PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)op)->length : \
Expand Down Expand Up @@ -285,6 +296,7 @@ typedef struct {
((const char *)(PyUnicode_AS_UNICODE(op)))
/* Py_DEPRECATED(3.3) */

#endif /* HAVE_UNICODE_WCHAR_CACHE */

/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

Expand Down Expand Up @@ -1240,6 +1252,9 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);

/* Argument converters intended for the "O&" format unit: the first parameter
   is the argument object, the second the address of the output variable.
   NOTE(review): generated callers pair these with `const Py_UNICODE *`
   outputs and release the result with PyMem_Free() when
   USE_UNICODE_WCHAR_CACHE is 0 -- confirm ownership against the definitions.
   The "Opt" variant takes the place of the "Z" format unit in generated code,
   so it presumably also accepts None -- TODO confirm. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);

#ifdef __cplusplus
}
#endif
19 changes: 16 additions & 3 deletions Lib/test/clinic.test
Original file line number Diff line number Diff line change
Expand Up @@ -1871,13 +1871,26 @@ test_Py_UNICODE_converter(PyObject *module, PyObject *const *args, Py_ssize_t na
const Py_UNICODE *e;
Py_ssize_clean_t e_length;

if (!_PyArg_ParseStack(args, nargs, "uuZu#Z#:test_Py_UNICODE_converter",
&a, &b, &c, &d, &d_length, &e, &e_length)) {
if (!_PyArg_ParseStack(args, nargs, "O&O&O&u#Z#:test_Py_UNICODE_converter",
_PyUnicode_WideCharString_Converter, &a, _PyUnicode_WideCharString_Converter, &b, _PyUnicode_WideCharString_Opt_Converter, &c, &d, &d_length, &e, &e_length)) {
goto exit;
}
return_value = test_Py_UNICODE_converter_impl(module, a, b, c, d, d_length, e, e_length);

exit:
/* Cleanup for a */
#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free((void *)a);
#endif /* USE_UNICODE_WCHAR_CACHE */
/* Cleanup for b */
#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free((void *)b);
#endif /* USE_UNICODE_WCHAR_CACHE */
/* Cleanup for c */
#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free((void *)c);
#endif /* USE_UNICODE_WCHAR_CACHE */

return return_value;
}

Expand All @@ -1888,7 +1901,7 @@ test_Py_UNICODE_converter_impl(PyObject *module, const Py_UNICODE *a,
Py_ssize_clean_t d_length,
const Py_UNICODE *e,
Py_ssize_clean_t e_length)
/*[clinic end generated code: output=dd0a09a1b772e57b input=064a3b68ad7f04b0]*/
/*[clinic end generated code: output=ef45e982fedf0b3d input=064a3b68ad7f04b0]*/


/*[clinic input]
Expand Down
8 changes: 8 additions & 0 deletions Lib/test/support/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@
except ImportError:
resource = None

try:
from _testcapi import unicode_legacy_string
except ImportError:
unicode_legacy_string = None

__all__ = [
# globals
"PIPE_MAX_SIZE", "verbose", "max_memuse", "use_resources", "failfast",
Expand Down Expand Up @@ -814,6 +819,9 @@ def dec(*args, **kwargs):

requires_lzma = unittest.skipUnless(lzma, 'requires lzma')

# Skip decorator for tests that exercise the legacy Unicode C API.
# unicode_legacy_string is None when it could not be imported from
# _testcapi, in which case decorated tests are skipped.
requires_legacy_unicode_capi = unittest.skipUnless(unicode_legacy_string,
'requires legacy Unicode C API')

is_jython = sys.platform.startswith('java')

is_android = hasattr(sys, 'getandroidapilevel')
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,9 @@ def test_writerows_with_none(self):
self.assertEqual(fileobj.read(), 'a\r\n""\r\n')

@support.cpython_only
@support.requires_legacy_unicode_capi
def test_writerows_legacy_strings(self):
import _testcapi

c = _testcapi.unicode_legacy_string('a')
with TemporaryFile("w+", newline='') as fileobj:
writer = csv.writer(fileobj)
Expand Down
7 changes: 5 additions & 2 deletions Lib/test/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@
import numbers
import locale
from test.support import (run_unittest, run_doctest, is_resource_enabled,
requires_IEEE_754, requires_docstrings)
requires_IEEE_754, requires_docstrings,
requires_legacy_unicode_capi)
from test.support import (import_fresh_module, TestFailed,
run_with_locale, cpython_only)
run_with_locale, cpython_only, get_attribute)
import random
import inspect
import threading
Expand Down Expand Up @@ -581,6 +582,7 @@ def test_explicit_from_string(self):
self.assertRaises(InvalidOperation, Decimal, "1_2_\u00003")

@cpython_only
@requires_legacy_unicode_capi
def test_from_legacy_strings(self):
import _testcapi
Decimal = self.decimal.Decimal
Expand Down Expand Up @@ -2816,6 +2818,7 @@ def test_none_args(self):
Overflow])

@cpython_only
@requires_legacy_unicode_capi
def test_from_legacy_strings(self):
import _testcapi
c = self.decimal.Context()
Expand Down
4 changes: 4 additions & 0 deletions Lib/test/test_getargs2.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,7 @@ def test_et_hash(self):
buf = bytearray()
self.assertRaises(ValueError, getargs_et_hash, 'abc\xe9', 'latin1', buf)

@support.requires_legacy_unicode_capi
def test_u(self):
from _testcapi import getargs_u
self.assertEqual(getargs_u('abc\xe9'), 'abc\xe9')
Expand All @@ -994,6 +995,7 @@ def test_u(self):
self.assertRaises(TypeError, getargs_u, memoryview(b'memoryview'))
self.assertRaises(TypeError, getargs_u, None)

@support.requires_legacy_unicode_capi
def test_u_hash(self):
from _testcapi import getargs_u_hash
self.assertEqual(getargs_u_hash('abc\xe9'), 'abc\xe9')
Expand All @@ -1003,6 +1005,7 @@ def test_u_hash(self):
self.assertRaises(TypeError, getargs_u_hash, memoryview(b'memoryview'))
self.assertRaises(TypeError, getargs_u_hash, None)

@support.requires_legacy_unicode_capi
def test_Z(self):
from _testcapi import getargs_Z
self.assertEqual(getargs_Z('abc\xe9'), 'abc\xe9')
Expand All @@ -1012,6 +1015,7 @@ def test_Z(self):
self.assertRaises(TypeError, getargs_Z, memoryview(b'memoryview'))
self.assertIsNone(getargs_Z(None))

@support.requires_legacy_unicode_capi
def test_Z_hash(self):
from _testcapi import getargs_Z_hash
self.assertEqual(getargs_Z_hash('abc\xe9'), 'abc\xe9')
Expand Down
2 changes: 2 additions & 0 deletions Lib/test/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2832,6 +2832,7 @@ def test_copycharacters(self):
self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)

@support.cpython_only
@support.requires_legacy_unicode_capi
def test_encode_decimal(self):
from _testcapi import unicode_encodedecimal
self.assertEqual(unicode_encodedecimal('123'),
Expand All @@ -2848,6 +2849,7 @@ def test_encode_decimal(self):
unicode_encodedecimal, "123\u20ac", "replace")

@support.cpython_only
@support.requires_legacy_unicode_capi
def test_transform_decimal(self):
from _testcapi import unicode_transformdecimaltoascii as transform_decimal
self.assertEqual(transform_decimal('123'),
Expand Down
22 changes: 20 additions & 2 deletions Modules/_codecsmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -704,20 +704,38 @@ _codecs_unicode_internal_encode_impl(PyObject *module, PyObject *obj,
return NULL;

if (PyUnicode_Check(obj)) {
Py_UNICODE *u;
Py_ssize_t len, size;

if (PyUnicode_READY(obj) < 0)
return NULL;

u = PyUnicode_AsUnicodeAndSize(obj, &len);
#if USE_UNICODE_WCHAR_CACHE
Py_UNICODE *u = PyUnicode_AsUnicodeAndSize(obj, &len);
if (u == NULL)
return NULL;
if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
return PyErr_NoMemory();
size = len * sizeof(Py_UNICODE);
return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
PyUnicode_GET_LENGTH(obj));
#else /* USE_UNICODE_WCHAR_CACHE */
len = PyUnicode_AsWideChar(obj, NULL, 0);
if (len < 0) {
return NULL;
}
assert(len > 0);
len--;
if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
return PyErr_NoMemory();
}
size = len * sizeof(wchar_t);
PyObject *bytes = PyBytes_FromStringAndSize(NULL, size);
if (bytes == NULL) {
return NULL;
}
PyUnicode_AsWideChar(obj, (wchar_t *)PyBytes_AS_STRING(bytes), len);
return codec_tuple(bytes, PyUnicode_GET_LENGTH(obj));
#endif /* USE_UNICODE_WCHAR_CACHE */
}
else {
Py_buffer view;
Expand Down
28 changes: 20 additions & 8 deletions Modules/_ctypes/_ctypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -1293,7 +1293,6 @@ static int
WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored))
{
Py_ssize_t result = 0;
Py_UNICODE *wstr;
Py_ssize_t len;

if (value == NULL) {
Expand All @@ -1309,21 +1308,32 @@ WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored
} else
Py_INCREF(value);

wstr = PyUnicode_AsUnicodeAndSize(value, &len);
if (wstr == NULL)
#if USE_UNICODE_WCHAR_CACHE
len = PyUnicode_GetSize(value);
if (len < 0) {
Py_DECREF(value);
return -1;
}
#else /* USE_UNICODE_WCHAR_CACHE */
len = PyUnicode_AsWideChar(value, NULL, 0);
if (len < 0) {
Py_DECREF(value);
return -1;
}
assert(len > 0);
len--;
#endif /* USE_UNICODE_WCHAR_CACHE */
if ((size_t)len > self->b_size/sizeof(wchar_t)) {
PyErr_SetString(PyExc_ValueError,
"string too long");
result = -1;
goto done;
Py_DECREF(value);
return -1;
}
result = PyUnicode_AsWideChar(value,
(wchar_t *)self->b_ptr,
self->b_size/sizeof(wchar_t));
if (result >= 0 && (size_t)result < self->b_size/sizeof(wchar_t))
((wchar_t *)self->b_ptr)[result] = (wchar_t)0;
done:
Py_DECREF(value);

return result >= 0 ? 0 : -1;
Expand Down Expand Up @@ -3358,10 +3368,12 @@ _validate_paramflags(PyTypeObject *type, PyObject *paramflags)
for (i = 0; i < len; ++i) {
PyObject *item = PyTuple_GET_ITEM(paramflags, i);
int flag;
char *name;
PyObject *name = Py_None;
PyObject *defval;
PyObject *typ;
if (!PyArg_ParseTuple(item, "i|ZO", &flag, &name, &defval)) {
if (!PyArg_ParseTuple(item, "i|OO", &flag, &name, &defval) ||
!(name == Py_None || PyUnicode_Check(name)))
{
PyErr_SetString(PyExc_TypeError,
"paramflags must be a sequence of (int [,string [,value]]) tuples");
return 0;
Expand Down
10 changes: 8 additions & 2 deletions Modules/_ctypes/callproc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1274,22 +1274,28 @@ The handle may be used to locate exported functions in this\n\
module.\n";
static PyObject *load_library(PyObject *self, PyObject *args)
{
const WCHAR *name;
PyObject *nameobj;
PyObject *ignored;
HMODULE hMod;

if (!PyArg_ParseTuple(args, "U|O:LoadLibrary", &nameobj, &ignored))
return NULL;

name = _PyUnicode_AsUnicode(nameobj);
#if USE_UNICODE_WCHAR_CACHE
const WCHAR *name = _PyUnicode_AsUnicode(nameobj);
#else /* USE_UNICODE_WCHAR_CACHE */
WCHAR *name = PyUnicode_AsWideCharString(nameobj, NULL);
#endif /* USE_UNICODE_WCHAR_CACHE */
if (!name)
return NULL;

Py_BEGIN_ALLOW_THREADS
hMod = LoadLibraryW(name);
Py_END_ALLOW_THREADS

#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free(name);
#endif /* USE_UNICODE_WCHAR_CACHE */
if (!hMod)
return PyErr_SetFromWindowsErr(GetLastError());
#ifdef _WIN64
Expand Down
Loading

0 comments on commit 699e616

Please sign in to comment.