bpo-36346: Prepare for removing the legacy Unicode C API.

Add two compile-time options: HAVE_UNICODE_WCHAR_CACHE and USE_UNICODE_WCHAR_CACHE (both default to 1). Setting USE_UNICODE_WCHAR_CACHE to 0 makes the interpreter itself stop using the wchar_t cache and the legacy Unicode C API. Setting HAVE_UNICODE_WCHAR_CACHE to 0 removes the wchar_t cache and the legacy Unicode C API that depends on it.
serhiy-storchaka · Mar 18, 2019 · 699e616 · 699e616
1 parent f40b4a0
commit 699e616
Show file tree

Hide file tree

Showing 29 changed files with 1,237 additions and 336 deletions.
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
@@ -15,6 +15,9 @@ typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;
 
 /* --- Internal Unicode Operations ---------------------------------------- */
 
+#define HAVE_UNICODE_WCHAR_CACHE 1
+#define USE_UNICODE_WCHAR_CACHE 1
+
 /* Since splitting on whitespace is an important use case, and
    whitespace in most situations is solely ASCII whitespace, we
    optimize for the common case by using a quick look-up table
@@ -71,13 +74,15 @@ typedef wchar_t Py_UNICODE /* Py_DEPRECATED(3.3) */;
 /* low surrogate = bottom 10 bits added to DC00 */
 #define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
 
+#if HAVE_UNICODE_WCHAR_CACHE
 /* Check if substring matches at given offset.  The offset must be
    valid, and the substring must not be empty. */
 
 #define Py_UNICODE_MATCH(string, offset, substring) \
     ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
      ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
      !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
+#endif /* HAVE_UNICODE_WCHAR_CACHE */
 
 /* --- Unicode Type ------------------------------------------------------- */
 
@@ -218,7 +223,9 @@ typedef struct {
            4 bytes (see issue #19537 on m68k). */
         unsigned int :24;
     } state;
+#if HAVE_UNICODE_WCHAR_CACHE
     wchar_t *wstr;              /* wchar_t representation (null-terminated) */
+#endif /* HAVE_UNICODE_WCHAR_CACHE */
 } PyASCIIObject;
 
 /* Non-ASCII strings allocated through PyUnicode_New use the
@@ -229,8 +236,10 @@ typedef struct {
     Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                  * terminating \0. */
     char *utf8;                 /* UTF-8 representation (null-terminated) */
+#if HAVE_UNICODE_WCHAR_CACHE
     Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                  * surrogates count as two code points. */
+#endif /* HAVE_UNICODE_WCHAR_CACHE */
 } PyCompactUnicodeObject;
 
 /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
@@ -247,6 +256,8 @@ typedef struct {
 } PyUnicodeObject;
 
 /* Fast access macros */
+#if HAVE_UNICODE_WCHAR_CACHE
+
 #define PyUnicode_WSTR_LENGTH(op) \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
      ((PyASCIIObject*)op)->length :                    \
@@ -285,6 +296,7 @@ typedef struct {
     ((const char *)(PyUnicode_AS_UNICODE(op)))
     /* Py_DEPRECATED(3.3) */
 
+#endif /* HAVE_UNICODE_WCHAR_CACHE */
 
 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
 
@@ -1240,6 +1252,9 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
    and where the hash values are equal (i.e. a very probable match) */
 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
 
+PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Lib/test/clinic.test b/Lib/test/clinic.test
@@ -1871,13 +1871,26 @@ test_Py_UNICODE_converter(PyObject *module, PyObject *const *args, Py_ssize_t na
     const Py_UNICODE *e;
     Py_ssize_clean_t e_length;
 
-    if (!_PyArg_ParseStack(args, nargs, "uuZu#Z#:test_Py_UNICODE_converter",
-        &a, &b, &c, &d, &d_length, &e, &e_length)) {
+    if (!_PyArg_ParseStack(args, nargs, "O&O&O&u#Z#:test_Py_UNICODE_converter",
+        _PyUnicode_WideCharString_Converter, &a, _PyUnicode_WideCharString_Converter, &b, _PyUnicode_WideCharString_Opt_Converter, &c, &d, &d_length, &e, &e_length)) {
         goto exit;
     }
     return_value = test_Py_UNICODE_converter_impl(module, a, b, c, d, d_length, e, e_length);
 
 exit:
+    /* Cleanup for a */
+    #if !USE_UNICODE_WCHAR_CACHE
+    PyMem_Free((void *)a);
+    #endif /* USE_UNICODE_WCHAR_CACHE */
+    /* Cleanup for b */
+    #if !USE_UNICODE_WCHAR_CACHE
+    PyMem_Free((void *)b);
+    #endif /* USE_UNICODE_WCHAR_CACHE */
+    /* Cleanup for c */
+    #if !USE_UNICODE_WCHAR_CACHE
+    PyMem_Free((void *)c);
+    #endif /* USE_UNICODE_WCHAR_CACHE */
+
     return return_value;
 }
 
@@ -1888,7 +1901,7 @@ test_Py_UNICODE_converter_impl(PyObject *module, const Py_UNICODE *a,
                                Py_ssize_clean_t d_length,
                                const Py_UNICODE *e,
                                Py_ssize_clean_t e_length)
-/*[clinic end generated code: output=dd0a09a1b772e57b input=064a3b68ad7f04b0]*/
+/*[clinic end generated code: output=ef45e982fedf0b3d input=064a3b68ad7f04b0]*/
 
 
 /*[clinic input]

diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py
@@ -68,6 +68,11 @@
 except ImportError:
     resource = None
 
+try:
+    from _testcapi import unicode_legacy_string
+except ImportError:
+    unicode_legacy_string = None
+
 __all__ = [
     # globals
     "PIPE_MAX_SIZE", "verbose", "max_memuse", "use_resources", "failfast",
@@ -814,6 +819,9 @@ def dec(*args, **kwargs):
 
 requires_lzma = unittest.skipUnless(lzma, 'requires lzma')
 
+requires_legacy_unicode_capi = unittest.skipUnless(unicode_legacy_string,
+                        'requires legacy Unicode C API')
+
 is_jython = sys.platform.startswith('java')
 
 is_android = hasattr(sys, 'getandroidapilevel')

diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
@@ -231,9 +231,9 @@ def test_writerows_with_none(self):
             self.assertEqual(fileobj.read(), 'a\r\n""\r\n')
 
     @support.cpython_only
+    @support.requires_legacy_unicode_capi
     def test_writerows_legacy_strings(self):
         import _testcapi
-
         c = _testcapi.unicode_legacy_string('a')
         with TemporaryFile("w+", newline='') as fileobj:
             writer = csv.writer(fileobj)

diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py
@@ -33,9 +33,10 @@
 import numbers
 import locale
 from test.support import (run_unittest, run_doctest, is_resource_enabled,
-                          requires_IEEE_754, requires_docstrings)
+                          requires_IEEE_754, requires_docstrings,
+                          requires_legacy_unicode_capi)
 from test.support import (import_fresh_module, TestFailed,
-                          run_with_locale, cpython_only)
+                          run_with_locale, cpython_only, get_attribute)
 import random
 import inspect
 import threading
@@ -581,6 +582,7 @@ def test_explicit_from_string(self):
             self.assertRaises(InvalidOperation, Decimal, "1_2_\u00003")
 
     @cpython_only
+    @requires_legacy_unicode_capi
     def test_from_legacy_strings(self):
         import _testcapi
         Decimal = self.decimal.Decimal
@@ -2816,6 +2818,7 @@ def test_none_args(self):
                                               Overflow])
 
     @cpython_only
+    @requires_legacy_unicode_capi
     def test_from_legacy_strings(self):
         import _testcapi
         c = self.decimal.Context()

diff --git a/Lib/test/test_getargs2.py b/Lib/test/test_getargs2.py
@@ -985,6 +985,7 @@ def test_et_hash(self):
         buf = bytearray()
         self.assertRaises(ValueError, getargs_et_hash, 'abc\xe9', 'latin1', buf)
 
+    @support.requires_legacy_unicode_capi
     def test_u(self):
         from _testcapi import getargs_u
         self.assertEqual(getargs_u('abc\xe9'), 'abc\xe9')
@@ -994,6 +995,7 @@ def test_u(self):
         self.assertRaises(TypeError, getargs_u, memoryview(b'memoryview'))
         self.assertRaises(TypeError, getargs_u, None)
 
+    @support.requires_legacy_unicode_capi
     def test_u_hash(self):
         from _testcapi import getargs_u_hash
         self.assertEqual(getargs_u_hash('abc\xe9'), 'abc\xe9')
@@ -1003,6 +1005,7 @@ def test_u_hash(self):
         self.assertRaises(TypeError, getargs_u_hash, memoryview(b'memoryview'))
         self.assertRaises(TypeError, getargs_u_hash, None)
 
+    @support.requires_legacy_unicode_capi
     def test_Z(self):
         from _testcapi import getargs_Z
         self.assertEqual(getargs_Z('abc\xe9'), 'abc\xe9')
@@ -1012,6 +1015,7 @@ def test_Z(self):
         self.assertRaises(TypeError, getargs_Z, memoryview(b'memoryview'))
         self.assertIsNone(getargs_Z(None))
 
+    @support.requires_legacy_unicode_capi
     def test_Z_hash(self):
         from _testcapi import getargs_Z_hash
         self.assertEqual(getargs_Z_hash('abc\xe9'), 'abc\xe9')

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
@@ -2832,6 +2832,7 @@ def test_copycharacters(self):
         self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
 
     @support.cpython_only
+    @support.requires_legacy_unicode_capi
     def test_encode_decimal(self):
         from _testcapi import unicode_encodedecimal
         self.assertEqual(unicode_encodedecimal('123'),
@@ -2848,6 +2849,7 @@ def test_encode_decimal(self):
             unicode_encodedecimal, "123\u20ac", "replace")
 
     @support.cpython_only
+    @support.requires_legacy_unicode_capi
     def test_transform_decimal(self):
         from _testcapi import unicode_transformdecimaltoascii as transform_decimal
         self.assertEqual(transform_decimal('123'),

diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
@@ -704,20 +704,38 @@ _codecs_unicode_internal_encode_impl(PyObject *module, PyObject *obj,
         return NULL;
 
     if (PyUnicode_Check(obj)) {
-        Py_UNICODE *u;
         Py_ssize_t len, size;
 
         if (PyUnicode_READY(obj) < 0)
             return NULL;
 
-        u = PyUnicode_AsUnicodeAndSize(obj, &len);
+#if USE_UNICODE_WCHAR_CACHE
+        Py_UNICODE *u = PyUnicode_AsUnicodeAndSize(obj, &len);
         if (u == NULL)
             return NULL;
         if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
             return PyErr_NoMemory();
         size = len * sizeof(Py_UNICODE);
         return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
                            PyUnicode_GET_LENGTH(obj));
+#else /* USE_UNICODE_WCHAR_CACHE */
+        len = PyUnicode_AsWideChar(obj, NULL, 0);
+        if (len < 0) {
+            return NULL;
+        }
+        assert(len > 0);
+        len--;
+        if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+            return PyErr_NoMemory();
+        }
+        size = len * sizeof(wchar_t);
+        PyObject *bytes = PyBytes_FromStringAndSize(NULL, size);
+        if (bytes == NULL) {
+            return NULL;
+        }
+        PyUnicode_AsWideChar(obj, (wchar_t *)PyBytes_AS_STRING(bytes), len);
+        return codec_tuple(bytes, PyUnicode_GET_LENGTH(obj));
+#endif /* USE_UNICODE_WCHAR_CACHE */
     }
     else {
         Py_buffer view;

diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c
@@ -1293,7 +1293,6 @@ static int
 WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored))
 {
     Py_ssize_t result = 0;
-    Py_UNICODE *wstr;
     Py_ssize_t len;
 
     if (value == NULL) {
@@ -1309,21 +1308,32 @@ WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored
     } else
         Py_INCREF(value);
 
-    wstr = PyUnicode_AsUnicodeAndSize(value, &len);
-    if (wstr == NULL)
+#if USE_UNICODE_WCHAR_CACHE
+    len = PyUnicode_GetSize(value);
+    if (len < 0) {
+        Py_DECREF(value);
+        return -1;
+    }
+#else /* USE_UNICODE_WCHAR_CACHE */
+    len = PyUnicode_AsWideChar(value, NULL, 0);
+    if (len < 0) {
+        Py_DECREF(value);
         return -1;
+    }
+    assert(len > 0);
+    len--;
+#endif /* USE_UNICODE_WCHAR_CACHE */
     if ((size_t)len > self->b_size/sizeof(wchar_t)) {
         PyErr_SetString(PyExc_ValueError,
                         "string too long");
-        result = -1;
-        goto done;
+        Py_DECREF(value);
+        return -1;
     }
     result = PyUnicode_AsWideChar(value,
                                   (wchar_t *)self->b_ptr,
                                   self->b_size/sizeof(wchar_t));
     if (result >= 0 && (size_t)result < self->b_size/sizeof(wchar_t))
         ((wchar_t *)self->b_ptr)[result] = (wchar_t)0;
-  done:
     Py_DECREF(value);
 
     return result >= 0 ? 0 : -1;
@@ -3358,10 +3368,12 @@ _validate_paramflags(PyTypeObject *type, PyObject *paramflags)
     for (i = 0; i < len; ++i) {
         PyObject *item = PyTuple_GET_ITEM(paramflags, i);
         int flag;
-        char *name;
+        PyObject *name = Py_None;
         PyObject *defval;
         PyObject *typ;
-        if (!PyArg_ParseTuple(item, "i|ZO", &flag, &name, &defval)) {
+        if (!PyArg_ParseTuple(item, "i|OO", &flag, &name, &defval) ||
+            !(name == Py_None || PyUnicode_Check(name)))
+        {
             PyErr_SetString(PyExc_TypeError,
                    "paramflags must be a sequence of (int [,string [,value]]) tuples");
             return 0;

diff --git a/Modules/_ctypes/callproc.c b/Modules/_ctypes/callproc.c
@@ -1274,22 +1274,28 @@ The handle may be used to locate exported functions in this\n\
 module.\n";
 static PyObject *load_library(PyObject *self, PyObject *args)
 {
-    const WCHAR *name;
     PyObject *nameobj;
     PyObject *ignored;
     HMODULE hMod;
 
     if (!PyArg_ParseTuple(args, "U|O:LoadLibrary", &nameobj, &ignored))
         return NULL;
 
-    name = _PyUnicode_AsUnicode(nameobj);
+#if USE_UNICODE_WCHAR_CACHE
+    const WCHAR *name = _PyUnicode_AsUnicode(nameobj);
+#else /* USE_UNICODE_WCHAR_CACHE */
+    WCHAR *name = PyUnicode_AsWideCharString(nameobj, NULL);
+#endif /* USE_UNICODE_WCHAR_CACHE */
     if (!name)
         return NULL;
 
     Py_BEGIN_ALLOW_THREADS
     hMod = LoadLibraryW(name);
     Py_END_ALLOW_THREADS
 
+#if !USE_UNICODE_WCHAR_CACHE
+    PyMem_Free(name);
+#endif /* USE_UNICODE_WCHAR_CACHE */
     if (!hMod)
         return PyErr_SetFromWindowsErr(GetLastError());
 #ifdef _WIN64