Skip to content

Commit

Permalink
[3.13] pythongh-52551: Fix encoding issues in strftime() (pythonGH-12…
Browse files Browse the repository at this point in the history
…5193)

Fix time.strftime(), the strftime() method and formatting of the
datetime classes datetime, date and time.

* Characters not encodable in the current locale are now acceptable in
  the format string.
* Surrogate pairs and sequences of surrogateescape-encoded bytes are no
  longer recombined.
* An embedded null character no longer terminates the format string.

This also fixes pythongh-78662 and pythongh-124531.
(cherry picked from commit ad3eac1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
  • Loading branch information
serhiy-storchaka committed Oct 17, 2024
1 parent 7a2db76 commit 13ef787
Show file tree
Hide file tree
Showing 5 changed files with 291 additions and 211 deletions.
63 changes: 55 additions & 8 deletions Lib/test/datetimetester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2855,11 +2855,32 @@ def test_more_strftime(self):
self.assertEqual(t.strftime("%z"), "-0200" + z)
self.assertEqual(t.strftime("%:z"), "-02:00:" + z)

# bpo-34482: Check that surrogates don't cause a crash.
try:
t.strftime('%y\ud800%m %H\ud800%M')
except UnicodeEncodeError:
pass
def test_strftime_special(self):
# Check that strftime() on self.theclass copies "special" characters in
# the format string (astral code points, lone surrogates, embedded nulls)
# to the output unchanged, both on their own and mixed with directives.
t = self.theclass(2004, 12, 31, 6, 22, 33, 47)
# Expand %c and %B once up front; the expected strings below are built
# from these results instead of hard-coding what the directives produce.
s1 = t.strftime('%c')
s2 = t.strftime('%B')
# gh-52551, gh-78662: Unicode strings should pass through strftime,
# independently from locale.
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
self.assertEqual(t.strftime('\U0001f4bb%c\U0001f40d%B'), f'\U0001f4bb{s1}\U0001f40d{s2}')
self.assertEqual(t.strftime('%c\U0001f4bb%B\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
# Lone surrogates should pass through.
# (\ud83d and \udc0d are the high and low halves of the UTF-16 pair for
# U+1F40D; each is tested alone, separated by directives, and in both
# orders.)
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
self.assertEqual(t.strftime('\ud83d%c\udc0d%B'), f'\ud83d{s1}\udc0d{s2}')
self.assertEqual(t.strftime('%c\ud83d%B\udc0d'), f'{s1}\ud83d{s2}\udc0d')
self.assertEqual(t.strftime('%c\udc0d%B\ud83d'), f'{s1}\udc0d{s2}\ud83d')
# Surrogate pairs should not recombine.
# (An adjacent \ud83d\udc0d must stay two code points, not become the
# single character U+1F40D.)
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
self.assertEqual(t.strftime('%c\ud83d\udc0d%B'), f'{s1}\ud83d\udc0d{s2}')
# Surrogate-escaped bytes should not recombine.
# (\udcf0\udc9f\udc90\udc8d is the surrogateescape form of the bytes
# F0 9F 90 8D, i.e. the UTF-8 encoding of U+1F40D; the four escapes must
# not be decoded back into that character.)
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
self.assertEqual(t.strftime('%c\udcf0\udc9f\udc90\udc8d%B'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
# gh-124531: The null character should not terminate the format string.
# (Covers a single null, a long run of nulls, and nulls before, between
# and after directives.)
self.assertEqual(t.strftime('\0'), '\0')
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
self.assertEqual(t.strftime('\0%c\0%B'), f'\0{s1}\0{s2}')
self.assertEqual(t.strftime('%c\0%B\0'), f'{s1}\0{s2}\0')

def test_extract(self):
dt = self.theclass(2002, 3, 4, 18, 45, 3, 1234)
Expand Down Expand Up @@ -3633,6 +3654,33 @@ def test_strftime(self):
# gh-85432: The parameter was named "fmt" in the pure-Python impl.
t.strftime(format="%f")

def test_strftime_special(self):
# Check that strftime() on self.theclass copies "special" characters in
# the format string (astral code points, lone surrogates, embedded nulls)
# to the output unchanged, both on their own and mixed with directives.
t = self.theclass(1, 2, 3, 4)
# Expand the directive groups once up front; the expected strings below
# are built from these results instead of hard-coding their output.
s1 = t.strftime('%I%p%Z')
s2 = t.strftime('%X')
# gh-52551, gh-78662: Unicode strings should pass through strftime,
# independently from locale.
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
self.assertEqual(t.strftime('\U0001f4bb%I%p%Z\U0001f40d%X'), f'\U0001f4bb{s1}\U0001f40d{s2}')
self.assertEqual(t.strftime('%I%p%Z\U0001f4bb%X\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
# Lone surrogates should pass through.
# (\ud83d and \udc0d are the high and low halves of the UTF-16 pair for
# U+1F40D; each is tested alone, separated by directives, and in both
# orders.)
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
self.assertEqual(t.strftime('\ud83d%I%p%Z\udc0d%X'), f'\ud83d{s1}\udc0d{s2}')
self.assertEqual(t.strftime('%I%p%Z\ud83d%X\udc0d'), f'{s1}\ud83d{s2}\udc0d')
self.assertEqual(t.strftime('%I%p%Z\udc0d%X\ud83d'), f'{s1}\udc0d{s2}\ud83d')
# Surrogate pairs should not recombine.
# (An adjacent \ud83d\udc0d must stay two code points, not become the
# single character U+1F40D.)
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
self.assertEqual(t.strftime('%I%p%Z\ud83d\udc0d%X'), f'{s1}\ud83d\udc0d{s2}')
# Surrogate-escaped bytes should not recombine.
# (\udcf0\udc9f\udc90\udc8d is the surrogateescape form of the bytes
# F0 9F 90 8D, i.e. the UTF-8 encoding of U+1F40D; the four escapes must
# not be decoded back into that character.)
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
self.assertEqual(t.strftime('%I%p%Z\udcf0\udc9f\udc90\udc8d%X'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
# gh-124531: The null character should not terminate the format string.
# (Covers a single null, a long run of nulls, and nulls before, between
# and after directives.)
self.assertEqual(t.strftime('\0'), '\0')
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
self.assertEqual(t.strftime('\0%I%p%Z\0%X'), f'\0{s1}\0{s2}')
self.assertEqual(t.strftime('%I%p%Z\0%X\0'), f'{s1}\0{s2}\0')

def test_format(self):
t = self.theclass(1, 2, 3, 4)
self.assertEqual(t.__format__(''), str(t))
Expand Down Expand Up @@ -4084,9 +4132,8 @@ def tzname(self, dt): return self.tz
self.assertRaises(TypeError, t.strftime, "%Z")

# Issue #6697:
if '_Fast' in self.__class__.__name__:
Badtzname.tz = '\ud800'
self.assertRaises(ValueError, t.strftime, "%Z")
Badtzname.tz = '\ud800'
self.assertEqual(t.strftime("%Z"), '\ud800')

def test_hash_edge_cases(self):
# Offsets that overflow a basic time.
Expand Down
29 changes: 27 additions & 2 deletions Lib/test/test_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,33 @@ def test_strftime(self):
self.fail('conversion specifier: %r failed.' % format)

self.assertRaises(TypeError, time.strftime, b'%S', tt)
# embedded null character
self.assertRaises(ValueError, time.strftime, '%S\0', tt)

def test_strftime_special(self):
# Check that time.strftime() copies "special" characters in the format
# string (astral code points, lone surrogates, embedded nulls) to the
# output unchanged, both on their own and mixed with directives.
tt = time.gmtime(self.t)
# Expand %c and %B once up front; the expected strings below are built
# from these results instead of hard-coding what the directives produce.
s1 = time.strftime('%c', tt)
s2 = time.strftime('%B', tt)
# gh-52551, gh-78662: Unicode strings should pass through strftime,
# independently from locale.
self.assertEqual(time.strftime('\U0001f40d', tt), '\U0001f40d')
self.assertEqual(time.strftime('\U0001f4bb%c\U0001f40d%B', tt), f'\U0001f4bb{s1}\U0001f40d{s2}')
self.assertEqual(time.strftime('%c\U0001f4bb%B\U0001f40d', tt), f'{s1}\U0001f4bb{s2}\U0001f40d')
# Lone surrogates should pass through.
# (\ud83d and \udc0d are the high and low halves of the UTF-16 pair for
# U+1F40D; each is tested alone, separated by directives, and in both
# orders.)
self.assertEqual(time.strftime('\ud83d', tt), '\ud83d')
self.assertEqual(time.strftime('\udc0d', tt), '\udc0d')
self.assertEqual(time.strftime('\ud83d%c\udc0d%B', tt), f'\ud83d{s1}\udc0d{s2}')
self.assertEqual(time.strftime('%c\ud83d%B\udc0d', tt), f'{s1}\ud83d{s2}\udc0d')
self.assertEqual(time.strftime('%c\udc0d%B\ud83d', tt), f'{s1}\udc0d{s2}\ud83d')
# Surrogate pairs should not recombine.
# (An adjacent \ud83d\udc0d must stay two code points, not become the
# single character U+1F40D.)
self.assertEqual(time.strftime('\ud83d\udc0d', tt), '\ud83d\udc0d')
self.assertEqual(time.strftime('%c\ud83d\udc0d%B', tt), f'{s1}\ud83d\udc0d{s2}')
# Surrogate-escaped bytes should not recombine.
# (\udcf0\udc9f\udc90\udc8d is the surrogateescape form of the bytes
# F0 9F 90 8D, i.e. the UTF-8 encoding of U+1F40D; the four escapes must
# not be decoded back into that character.)
self.assertEqual(time.strftime('\udcf0\udc9f\udc90\udc8d', tt), '\udcf0\udc9f\udc90\udc8d')
self.assertEqual(time.strftime('%c\udcf0\udc9f\udc90\udc8d%B', tt), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
# gh-124531: The null character should not terminate the format string.
# (Covers a single null, a long run of nulls, and nulls before, between
# and after directives.)
self.assertEqual(time.strftime('\0', tt), '\0')
self.assertEqual(time.strftime('\0'*1000, tt), '\0'*1000)
self.assertEqual(time.strftime('\0%c\0%B', tt), f'\0{s1}\0{s2}')
self.assertEqual(time.strftime('%c\0%B\0', tt), f'{s1}\0{s2}\0')

def _bounds_checking(self, func):
# Make sure that strftime() checks the bounds of the various parts
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Fix encoding issues in :func:`time.strftime`, the
:meth:`~datetime.datetime.strftime` method of the :mod:`datetime` classes
:class:`~datetime.datetime`, :class:`~datetime.date` and
:class:`~datetime.time` and formatting of these classes. Characters not
encodable in the current locale are now acceptable in the format string.
Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer
recombined. An embedded null character no longer terminates the format
string.
169 changes: 67 additions & 102 deletions Modules/_datetimemodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -1746,7 +1746,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
PyObject *tzinfo = get_tzinfo_member(object);

if (tzinfo == Py_None || tzinfo == NULL) {
return PyBytes_FromStringAndSize(NULL, 0);
return PyUnicode_FromStringAndSize(NULL, 0);
}

assert(tzinfoarg != NULL);
Expand All @@ -1757,7 +1757,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
tzinfoarg) < 0)
return NULL;

return PyBytes_FromStringAndSize(buf, strlen(buf));
return PyUnicode_FromString(buf);
}

static PyObject *
Expand Down Expand Up @@ -1814,7 +1814,7 @@ make_freplacement(PyObject *object)
else
sprintf(freplacement, "%06d", 0);

return PyBytes_FromStringAndSize(freplacement, strlen(freplacement));
return PyUnicode_FromString(freplacement);
}

/* I sure don't want to reproduce the strftime code from the time module,
Expand All @@ -1835,159 +1835,124 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple,
PyObject *Zreplacement = NULL; /* py string, replacement for %Z */
PyObject *freplacement = NULL; /* py string, replacement for %f */

const char *pin; /* pointer to next char in input format */
Py_ssize_t flen; /* length of input format */
char ch; /* next char in input format */

PyObject *newfmt = NULL; /* py string, the output format */
char *pnew; /* pointer to available byte in output format */
size_t totalnew; /* number bytes total in output format buffer,
exclusive of trailing \0 */
size_t usednew; /* number bytes used so far in output format buffer */

const char *ptoappend; /* ptr to string to append to output buffer */
Py_ssize_t ntoappend; /* # of bytes to append to output buffer */

assert(object && format && timetuple);
assert(PyUnicode_Check(format));
/* Convert the input format to a C string and size */
pin = PyUnicode_AsUTF8AndSize(format, &flen);
if (!pin)

PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");
if (strftime == NULL) {
return NULL;
}

/* Scan the input format, looking for %z/%Z/%f escapes, building
* a new format. Since computing the replacements for those codes
* is expensive, don't unless they're actually used.
*/
if (flen > INT_MAX - 1) {
PyErr_NoMemory();
goto Done;
}

totalnew = flen + 1; /* realistic if no %z/%Z */
newfmt = PyBytes_FromStringAndSize(NULL, totalnew);
if (newfmt == NULL) goto Done;
pnew = PyBytes_AsString(newfmt);
usednew = 0;

while ((ch = *pin++) != '\0') {
if (ch != '%') {
ptoappend = pin - 1;
ntoappend = 1;
_PyUnicodeWriter writer;
_PyUnicodeWriter_Init(&writer);
writer.overallocate = 1;

Py_ssize_t flen = PyUnicode_GET_LENGTH(format);
Py_ssize_t i = 0;
Py_ssize_t start = 0;
Py_ssize_t end = 0;
while (i != flen) {
i = PyUnicode_FindChar(format, '%', i, flen, 1);
if (i < 0) {
assert(!PyErr_Occurred());
break;
}
else if ((ch = *pin++) == '\0') {
/* Null byte follows %, copy only '%'.
*
* Back the pin up one char so that we catch the null check
* the next time through the loop.*/
pin--;
ptoappend = pin - 1;
ntoappend = 1;
end = i;
i++;
if (i == flen) {
break;
}
Py_UCS4 ch = PyUnicode_READ_CHAR(format, i);
i++;
/* A % has been seen and ch is the character after it. */
else if (ch == 'z') {
PyObject *replacement = NULL;
if (ch == 'z') {
/* %z -> +HHMM */
if (zreplacement == NULL) {
zreplacement = make_somezreplacement(object, "", tzinfoarg);
if (zreplacement == NULL)
goto Done;
goto Error;
}
assert(zreplacement != NULL);
assert(PyBytes_Check(zreplacement));
ptoappend = PyBytes_AS_STRING(zreplacement);
ntoappend = PyBytes_GET_SIZE(zreplacement);
replacement = zreplacement;
}
else if (ch == ':' && *pin == 'z' && pin++) {
else if (ch == ':' && i < flen && PyUnicode_READ_CHAR(format, i) == 'z') {
/* %:z -> +HH:MM */
i++;
if (colonzreplacement == NULL) {
colonzreplacement = make_somezreplacement(object, ":", tzinfoarg);
if (colonzreplacement == NULL)
goto Done;
goto Error;
}
assert(colonzreplacement != NULL);
assert(PyBytes_Check(colonzreplacement));
ptoappend = PyBytes_AS_STRING(colonzreplacement);
ntoappend = PyBytes_GET_SIZE(colonzreplacement);
replacement = colonzreplacement;
}
else if (ch == 'Z') {
/* format tzname */
if (Zreplacement == NULL) {
Zreplacement = make_Zreplacement(object,
tzinfoarg);
if (Zreplacement == NULL)
goto Done;
goto Error;
}
assert(Zreplacement != NULL);
assert(PyUnicode_Check(Zreplacement));
ptoappend = PyUnicode_AsUTF8AndSize(Zreplacement,
&ntoappend);
if (ptoappend == NULL)
goto Done;
replacement = Zreplacement;
}
else if (ch == 'f') {
/* format microseconds */
if (freplacement == NULL) {
freplacement = make_freplacement(object);
if (freplacement == NULL)
goto Done;
goto Error;
}
assert(freplacement != NULL);
assert(PyBytes_Check(freplacement));
ptoappend = PyBytes_AS_STRING(freplacement);
ntoappend = PyBytes_GET_SIZE(freplacement);
replacement = freplacement;
}
else {
/* percent followed by something else */
ptoappend = pin - 2;
ntoappend = 2;
}

/* Append the ntoappend chars starting at ptoappend to
* the new format.
*/
if (ntoappend == 0)
continue;
assert(ptoappend != NULL);
assert(ntoappend > 0);
while (usednew + ntoappend > totalnew) {
if (totalnew > (PY_SSIZE_T_MAX >> 1)) { /* overflow */
PyErr_NoMemory();
goto Done;
}
totalnew <<= 1;
if (_PyBytes_Resize(&newfmt, totalnew) < 0)
goto Done;
pnew = PyBytes_AsString(newfmt) + usednew;
}
memcpy(pnew, ptoappend, ntoappend);
pnew += ntoappend;
usednew += ntoappend;
assert(usednew <= totalnew);
assert(replacement != NULL);
assert(PyUnicode_Check(replacement));
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, end) < 0) {
goto Error;
}
start = i;
if (_PyUnicodeWriter_WriteStr(&writer, replacement) < 0) {
goto Error;
}
} /* end while() */

if (_PyBytes_Resize(&newfmt, usednew) < 0)
goto Done;
{
PyObject *format;
PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");

if (strftime == NULL)
PyObject *newformat;
if (start == 0) {
_PyUnicodeWriter_Dealloc(&writer);
newformat = Py_NewRef(format);
}
else {
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, flen) < 0) {
goto Error;
}
newformat = _PyUnicodeWriter_Finish(&writer);
if (newformat == NULL) {
goto Done;
format = PyUnicode_FromString(PyBytes_AS_STRING(newfmt));
if (format != NULL) {
result = PyObject_CallFunctionObjArgs(strftime,
format, timetuple, NULL);
Py_DECREF(format);
}
Py_DECREF(strftime);
}
result = PyObject_CallFunctionObjArgs(strftime,
newformat, timetuple, NULL);
Py_DECREF(newformat);

Done:
Py_XDECREF(freplacement);
Py_XDECREF(zreplacement);
Py_XDECREF(colonzreplacement);
Py_XDECREF(Zreplacement);
Py_XDECREF(newfmt);
Py_XDECREF(strftime);
return result;

Error:
_PyUnicodeWriter_Dealloc(&writer);
goto Done;
}

/* ---------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 13ef787

Please sign in to comment.