Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(python): use rust to convert to/from python datetimes #20660

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ once_cell = "1"
parking_lot = "0.12"
percent-encoding = "2.3"
pin-project-lite = "0.2"
pyo3 = { git = "https://github.com/bschoenmaeckers/pyo3.git", branch = "release-0.23" }
pyo3 = { git = "https://github.com/pyo3/pyo3.git", branch = "release-0.23.4" }
rand = "0.8"
rand_distr = "0.4"
raw-cpuid = "11"
Expand Down Expand Up @@ -136,8 +136,8 @@ features = [
[patch.crates-io]
# packed_simd_2 = { git = "https://github.com/rust-lang/packed_simd", rev = "e57c7ba11386147e6d2cbad7c88f376aab4bdc86" }
# simd-json = { git = "https://github.com/ritchie46/simd-json", branch = "alignment" }
pyo3 = { git = "https://github.com/bschoenmaeckers/pyo3.git", branch = "release-0.23" }
pyo3-ffi = { git = "https://github.com/bschoenmaeckers/pyo3.git", branch = "release-0.23" }
pyo3 = { git = "https://github.com/pyo3/pyo3.git", branch = "release-0.23.4" }
pyo3-ffi = { git = "https://github.com/pyo3/pyo3.git", branch = "release-0.23.4" }

[profile.mindebug-dev]
inherits = "dev"
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ arboard = { workspace = true, optional = true }
bincode = { workspace = true }
bytemuck = { workspace = true }
bytes = { workspace = true }
chrono-tz = { workspace = true }
either = { workspace = true }
flate2 = { workspace = true }
itoa = { workspace = true }
Expand All @@ -38,7 +39,7 @@ ndarray = { workspace = true }
num-traits = { workspace = true }
numpy = { workspace = true }
once_cell = { workspace = true }
pyo3 = { workspace = true, features = ["abi3-py39", "chrono", "multiple-pymethods"] }
pyo3 = { workspace = true, features = ["abi3-py39", "chrono", "chrono-tz", "multiple-pymethods"] }
recursive = { workspace = true }
serde_json = { workspace = true, optional = true }
thiserror = { workspace = true }
Expand Down
84 changes: 39 additions & 45 deletions crates/polars-python/src/conversion/any_value.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
use std::borrow::{Borrow, Cow};

use chrono_tz::Tz;
#[cfg(feature = "object")]
use polars::chunked_array::object::PolarsObjectSafe;
#[cfg(feature = "object")]
use polars::datatypes::OwnedObject;
use polars::datatypes::{DataType, Field, PlHashMap, TimeUnit};
use polars::prelude::{AnyValue, PlSmallStr, Series, TimeZone};
use polars_core::export::chrono::{NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike};
use polars::export::chrono::{DateTime, FixedOffset};
use polars::prelude::{AnyValue, PlSmallStr, Series};
use polars_core::export::chrono::{
Datelike, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,
};
use polars_core::utils::any_values_to_supertype_and_n_dtypes;
use polars_core::utils::arrow::temporal_conversions::date32_to_date;
use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};
Expand All @@ -17,7 +21,7 @@ use pyo3::types::{
use pyo3::{intern, IntoPyObjectExt};

use super::datetime::{
elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime, timestamp_to_naive_datetime,
datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
};
use super::{decimal_to_digits, struct_dict, ObjectValue, Wrap};
use crate::error::PyPolarsErr;
Expand Down Expand Up @@ -92,15 +96,11 @@ pub(crate) fn any_value_into_py_object<'py>(
date.into_bound_py_any(py)
},
AnyValue::Datetime(v, time_unit, time_zone) => {
datetime_to_py_object(py, utils, v, time_unit, time_zone)
datetime_to_py_object(py, v, time_unit, time_zone)
},
AnyValue::DatetimeOwned(v, time_unit, time_zone) => {
datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))
},
AnyValue::DatetimeOwned(v, time_unit, time_zone) => datetime_to_py_object(
py,
utils,
v,
time_unit,
time_zone.as_ref().map(AsRef::as_ref),
),
AnyValue::Duration(v, time_unit) => {
let time_delta = elapsed_offset_to_timedelta(v, time_unit);
time_delta.into_bound_py_any(py)
Expand Down Expand Up @@ -142,28 +142,6 @@ pub(crate) fn any_value_into_py_object<'py>(
}
}

/// Converts a datetime value `v`, expressed in time unit `tu`, into a Python object.
///
/// When `tz` is `Some`, the conversion is delegated to the `to_py_datetime`
/// attribute looked up on `utils`, which is called with the raw value, the ASCII
/// form of the time unit, and the time zone as a string. When `tz` is `None`, the
/// value goes through `timestamp_to_naive_datetime` and is converted directly.
///
/// # Errors
///
/// Propagates the `PyErr` produced by the attribute lookup, by the Python call,
/// or by the naive datetime conversion.
fn datetime_to_py_object<'py>(
py: Python<'py>,
utils: &Bound<'py, PyAny>,
v: i64,
tu: TimeUnit,
tz: Option<&TimeZone>,
) -> PyResult<Bound<'py, PyAny>> {
if let Some(time_zone) = tz {
// When https://github.com/pola-rs/polars/issues/16199 is
// implemented, we'll switch to something like:
//
// let tz: chrono_tz::Tz = time_zone.parse().unwrap();
// let datetime = tz.from_local_datetime(&naive_datetime).earliest().unwrap();
// datetime.into_py(py)
let convert = utils.getattr(intern!(py, "to_py_datetime"))?;
let time_unit = tu.to_ascii();
convert.call1((v, time_unit, time_zone.as_str()))
} else {
timestamp_to_naive_datetime(v, tu).into_pyobject(py)
}
}

/// Holds a Python type object and implements hashing / equality based on the pointer address of the
/// type object. This is used as a hashtable key instead of only the `usize` pointer value, as we
/// need to hold a ref to the Python type object to keep it alive.
Expand Down Expand Up @@ -273,18 +251,34 @@ pub(crate) fn py_object_to_any_value<'py>(
}

fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
// Probably needs to wait for
// https://github.com/pola-rs/polars/issues/16199 to do it a faster way.
Python::with_gil(|py| {
let date = pl_utils(py)
.bind(py)
.getattr(intern!(py, "datetime_to_int"))
.unwrap()
.call1((ob, intern!(py, "us")))
.unwrap();
let v = date.extract::<i64>()?;
Ok(AnyValue::Datetime(v, TimeUnit::Microseconds, None))
})
let py = ob.py();
let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;

let timestamp = if tzinfo.is_none() {
let datetime = ob.extract::<NaiveDateTime>()?;
let delta = datetime - NaiveDateTime::UNIX_EPOCH;
delta.num_microseconds().unwrap()
} else if tzinfo.hasattr(intern!(py, "key"))? {
let datetime = ob.extract::<DateTime<Tz>>()?;
if datetime.year() >= 2100 {
// chrono-tz does not support dates after 2100
// https://github.com/chronotope/chrono-tz/issues/135
pl_utils(py)
.bind(py)
.getattr(intern!(py, "datetime_to_int"))?
.call1((ob, intern!(py, "us")))?
.extract::<i64>()?
} else {
let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
delta.num_microseconds().unwrap()
}
} else {
let datetime = ob.extract::<DateTime<FixedOffset>>()?;
let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;
delta.num_microseconds().unwrap()
};

Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this return the timezone info instead of None in the cases where there is one?

}

fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {
Expand Down
29 changes: 7 additions & 22 deletions crates/polars-python/src/conversion/chunked_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use pyo3::types::{PyBytes, PyList, PyNone, PyTuple};
use pyo3::{intern, BoundObject};

use super::datetime::{
elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime, timestamp_to_naive_datetime,
datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,
};
use super::{decimal_to_digits, struct_dict};
use crate::prelude::*;
Expand Down Expand Up @@ -78,27 +78,12 @@ impl<'py> IntoPyObject<'py> for &Wrap<&DatetimeChunked> {
type Error = PyErr;

fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
let time_zone = self.0.time_zone();
if time_zone.is_some() {
// Switch to more efficient code path in
// https://github.com/pola-rs/polars/issues/16199
let utils = pl_utils(py).bind(py);
let convert = utils.getattr(intern!(py, "to_py_datetime"))?;
let time_unit = self.0.time_unit().to_ascii();
let time_zone = time_zone.as_deref().into_pyobject(py)?;
let iter = self
.0
.iter()
.map(|opt_v| opt_v.map(|v| convert.call1((v, time_unit, &time_zone)).unwrap()));
PyList::new(py, iter)
} else {
let time_unit = self.0.time_unit();
let iter = self
.0
.iter()
.map(|opt_v| opt_v.map(|v| timestamp_to_naive_datetime(v, time_unit)));
PyList::new(py, iter)
}
let time_zone = self.0.time_zone().as_ref();
let time_unit = self.0.time_unit();
let iter = self.0.iter().map(|opt_v| {
opt_v.map(|v| datetime_to_py_object(py, v, time_unit, time_zone).unwrap())
});
PyList::new(py, iter)
}
}

Expand Down
34 changes: 33 additions & 1 deletion crates/polars-python/src/conversion/datetime.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
//! Utilities for converting dates, times, datetimes, and so on.

use std::str::FromStr;

use chrono_tz::Tz;
use polars::datatypes::TimeUnit;
use polars_core::export::chrono::{NaiveDateTime, NaiveTime, TimeDelta};
use polars_core::datatypes::TimeZone;
use polars_core::export::chrono::{
DateTime, FixedOffset, NaiveDateTime, NaiveTime, TimeDelta, TimeZone as _,
};
use pyo3::{Bound, IntoPyObject, PyAny, PyResult, Python};

use crate::error::PyPolarsErr;

pub fn elapsed_offset_to_timedelta(elapsed: i64, time_unit: TimeUnit) -> TimeDelta {
let (in_second, nano_multiplier) = match time_unit {
Expand Down Expand Up @@ -29,3 +38,26 @@ pub fn nanos_since_midnight_to_naivetime(nanos_since_midnight: i64) -> NaiveTime
NaiveTime::from_hms_opt(0, 0, 0).unwrap()
+ elapsed_offset_to_timedelta(nanos_since_midnight, TimeUnit::Nanoseconds)
}

/// Builds the Python object for a datetime value `v` expressed in time unit `tu`.
///
/// * `tz` is `None`: the value goes through `timestamp_to_naive_datetime` and is
///   converted as-is.
/// * `tz` parses as a `Tz`: the value is added to the Unix epoch as a UTC instant
///   and then moved into that zone with `with_timezone`.
/// * `tz` parses as a `FixedOffset`: the naive datetime is treated as UTC and
///   attached to that offset with `from_utc_datetime`.
///
/// # Errors
///
/// Returns `PyPolarsErr::Other` (converted into a `PyErr`) when `tz` parses as
/// neither a `Tz` nor a `FixedOffset`, and forwards any error from the final
/// `into_pyobject` conversion.
pub fn datetime_to_py_object<'py>(
    py: Python<'py>,
    v: i64,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
) -> PyResult<Bound<'py, PyAny>> {
    // No time zone attached: hand back the naive datetime.
    let Some(time_zone) = tz else {
        return timestamp_to_naive_datetime(v, tu).into_pyobject(py);
    };

    // Named zones are tried first.
    if let Ok(named_zone) = Tz::from_str(time_zone) {
        let utc_instant = DateTime::UNIX_EPOCH + elapsed_offset_to_timedelta(v, tu);
        return utc_instant.with_timezone(&named_zone).into_pyobject(py);
    }

    // Otherwise try to read the string as a fixed UTC offset.
    if let Ok(offset) = FixedOffset::from_str(time_zone) {
        let naive_utc = timestamp_to_naive_datetime(v, tu);
        return offset.from_utc_datetime(&naive_utc).into_pyobject(py);
    }

    Err(PyPolarsErr::Other(format!("Could not parse timezone: {time_zone}")).into())
}
Loading