Commit

Merge branch 'main' of github.com:pola-rs/polars into low-memory-docstrings
Liam Brannigan committed Oct 9, 2023
2 parents 05e9057 + 1928b82 commit 0661da2
Showing 92 changed files with 1,549 additions and 892 deletions.
21 changes: 21 additions & 0 deletions .github/release-drafter-python.yml
@@ -13,3 +13,24 @@ version-resolver:
- breaking
- breaking python
default: patch

categories:
- title: 🏆 Highlights
labels: highlight
- title: 💥 Breaking changes
labels:
- breaking
- breaking python
- title: ⚠️ Deprecations
labels: deprecation
- title: 🚀 Performance improvements
labels: performance
- title: ✨ Enhancements
labels: enhancement
- title: 🐞 Bug fixes
labels: fix
- title: 🛠️ Other improvements
labels:
- build
- documentation
- internal
20 changes: 20 additions & 0 deletions .github/release-drafter-rust.yml
@@ -13,3 +13,23 @@ version-resolver:
- breaking
- breaking rust
default: patch

categories:
- title: 🏆 Highlights
labels: highlight
- title: 💥 Breaking changes
labels:
- breaking
- breaking rust
- title: 🚀 Performance improvements
labels: performance
- title: ✨ Enhancements
labels: enhancement
- title: 🐞 Bug fixes
labels: fix
- title: 🛠️ Other improvements
labels:
- build
- deprecation
- documentation
- internal
19 changes: 0 additions & 19 deletions .github/release-drafter.yml
@@ -1,22 +1,3 @@
categories:
- title: 🏆 Highlights
labels: highlight
- title: 💥 Breaking changes
labels: breaking
- title: ⚠️ Deprecations
labels: deprecation
- title: 🚀 Performance improvements
labels: performance
- title: ✨ Enhancements
labels: enhancement
- title: 🐞 Bug fixes
labels: fix
- title: 🛠️ Other improvements
labels:
- build
- documentation
- internal

exclude-labels:
- skip changelog
- release
@@ -498,9 +498,6 @@ where
if items.len() > 1 {
return MaybeNext::Some(Ok(items.pop_front().unwrap()));
}
if (items.len() == 1) && items.front().unwrap().0.len() == chunk_size.unwrap_or(usize::MAX) {
return MaybeNext::Some(Ok(items.pop_front().unwrap()));
}
if *remaining == 0 {
return match items.pop_front() {
Some(decoded) => MaybeNext::Some(Ok(decoded)),
@@ -541,11 +538,11 @@ where
};

if (items.len() == 1)
&& items.front().unwrap().0.len() < chunk_size.unwrap_or(usize::MAX)
&& items.front().unwrap().0.len() > chunk_size.unwrap_or(usize::MAX)
{
MaybeNext::More
} else {
MaybeNext::Some(Ok(items.pop_front().unwrap()))
} else {
MaybeNext::More
}
},
}
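Paraphrasing the reworked check above: when a single decoded item is buffered, it is handed out once its length exceeds the requested chunk_size, and otherwise more pages are read. A rough sketch of that rule with simplified stand-in types (a hypothetical Step enum instead of the crate's MaybeNext; not the actual signatures):

use std::collections::VecDeque;

// Simplified stand-in for the reader's decision; the real code returns MaybeNext.
enum Step<T> {
    Emit(T),
    ReadMore,
}

// Hypothetical helper mirroring the reworked condition: emit the single buffered
// item only once its length exceeds `chunk_size`, otherwise keep decoding pages.
fn next_step<T>(items: &mut VecDeque<(Vec<T>, ())>, chunk_size: Option<usize>) -> Step<(Vec<T>, ())> {
    let target = chunk_size.unwrap_or(usize::MAX);
    if items.len() == 1 && items.front().map_or(false, |item| item.0.len() > target) {
        Step::Emit(items.pop_front().unwrap())
    } else {
        Step::ReadMore
    }
}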
72 changes: 57 additions & 15 deletions crates/nano-arrow/src/io/parquet/read/schema/convert.rs
@@ -343,12 +343,12 @@ fn to_list(
let field = fields.first().unwrap();
(
&field.get_field_info().name,
field.get_field_info().repetition != Repetition::Required,
field.get_field_info().repetition == Repetition::Optional,
)
},
_ => (
&item.get_field_info().name,
item.get_field_info().repetition != Repetition::Required,
item.get_field_info().repetition == Repetition::Optional,
),
};
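The practical effect of switching from != Repetition::Required to == Repetition::Optional is that REPEATED list elements are now mapped to non-nullable Arrow child fields, which is what the updated tests below assert. A tiny sketch of the rule, with a stand-in enum for the Parquet repetition level (assumed variants Required / Optional / Repeated):

// Stand-in for the Parquet repetition level (assumption: these three variants).
#[derive(PartialEq)]
enum Repetition {
    Required,
    Optional,
    Repeated,
}

// After this change only OPTIONAL children become nullable Arrow fields;
// REQUIRED and REPEATED children of a LIST are treated as non-nullable.
fn is_nullable(repetition: &Repetition) -> bool {
    *repetition == Repetition::Optional
}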

@@ -596,7 +596,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
true,
));
}
@@ -608,7 +608,7 @@
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
DataType::List(Box::new(Field::new("element", DataType::Int32, false))),
true,
));
}
@@ -627,7 +627,7 @@
]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", arrow_struct, true))),
DataType::List(Box::new(Field::new("element", arrow_struct, false))),
true,
));
}
Expand All @@ -643,7 +643,7 @@ mod tests {
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("array", arrow_struct, true))),
DataType::List(Box::new(Field::new("array", arrow_struct, false))),
true,
));
}
@@ -659,7 +659,7 @@
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))),
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))),
true,
));
}
@@ -669,8 +669,50 @@
{
arrow_fields.push(Field::new(
"name",
DataType::List(Box::new(Field::new("name", DataType::Int32, true))),
true,
DataType::List(Box::new(Field::new("name", DataType::Int32, false))),
false,
));
}

let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema(parquet_schema.fields());

assert_eq!(arrow_fields, fields);
Ok(())
}

#[test]
fn test_parquet_list_with_struct() -> Result<()> {
let mut arrow_fields = Vec::new();

let message_type = "
message eventlog {
REQUIRED group events (LIST) {
REPEATED group array {
REQUIRED BYTE_ARRAY event_name (STRING);
REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
}
}
}
";

{
let struct_fields = vec![
Field::new("event_name", DataType::Utf8, false),
Field::new(
"event_time",
DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
false,
),
];
arrow_fields.push(Field::new(
"events",
DataType::List(Box::new(Field::new(
"array",
DataType::Struct(struct_fields),
false,
))),
false,
));
}

@@ -797,9 +839,9 @@ mod tests {
DataType::List(Box::new(Field::new(
"innerGroup",
DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
true,
false,
))),
true,
false,
);

let outer_group_list = Field::new(
@@ -810,9 +852,9 @@
Field::new("leaf2", DataType::Int32, true),
inner_group_list,
]),
true,
false,
))),
true,
false,
);
arrow_fields.push(outer_group_list);
}
@@ -873,8 +915,8 @@ mod tests {
Field::new("string", DataType::Utf8, true),
Field::new(
"bools",
DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
true,
DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))),
false,
),
Field::new("date", DataType::Date32, true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
13 changes: 9 additions & 4 deletions crates/polars-core/src/frame/group_by/mod.rs
@@ -60,16 +60,21 @@ impl DataFrame {
!by.is_empty(),
ComputeError: "at least one key is required in a group_by operation"
);
let by_len = by[0].len();
let minimal_by_len = by.iter().map(|s| s.len()).min().expect("at least 1 key");
let df_height = self.height();

// we only throw this error if self.width > 0
// so that we can still call this on a dummy dataframe where we provide the keys
if (by_len != self.height()) && (self.width() > 0) {
if (minimal_by_len != df_height) && (self.width() > 0) {
polars_ensure!(
by_len == 1,
minimal_by_len == 1,
ShapeMismatch: "series used as keys should have the same length as the dataframe"
);
by[0] = by[0].new_from_index(0, self.height())
for by_key in by.iter_mut() {
if by_key.len() == minimal_by_len {
*by_key = by_key.new_from_index(0, df_height)
}
}
};

let n_partitions = _set_partition_size();
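In words: the shortest key length now drives the shape check, and every unit-length key series is broadcast to the frame's height, so a mix of full-length and scalar keys is accepted. A rough sketch of the broadcasting step on plain vectors (not the Series API, and ignoring the zero-width special case):

// Hypothetical illustration of the new rule on plain vectors: keys shorter than
// the frame are only accepted when the shortest key has length 1, and every
// unit-length key is then repeated to the frame's height.
fn broadcast_keys<T: Clone>(mut keys: Vec<Vec<T>>, df_height: usize) -> Result<Vec<Vec<T>>, String> {
    let minimal_len = keys.iter().map(|k| k.len()).min().expect("at least 1 key");
    if minimal_len != df_height {
        if minimal_len != 1 {
            return Err("series used as keys should have the same length as the dataframe".into());
        }
        for key in keys.iter_mut() {
            if key.len() == minimal_len {
                *key = vec![key[0].clone(); df_height];
            }
        }
    }
    Ok(keys)
}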
21 changes: 9 additions & 12 deletions crates/polars-core/src/frame/group_by/proxy.rs
@@ -367,19 +367,16 @@ impl GroupsProxy {
}
}

pub fn take_group_lasts(self) -> Vec<IdxSize> {
/// # Safety
/// This will not do any bounds checks. The caller must ensure
/// all groups have members.
pub unsafe fn take_group_lasts(self) -> Vec<IdxSize> {
match self {
GroupsProxy::Idx(groups) => {
groups
.all
.iter()
.map(|idx| {
// safety:
// idx has at least one eletment, so -1 is always in bounds
unsafe { *idx.get_unchecked(idx.len() - 1) }
})
.collect()
},
GroupsProxy::Idx(groups) => groups
.all
.iter()
.map(|idx| *idx.get_unchecked(idx.len() - 1))
.collect(),
GroupsProxy::Slice { groups, .. } => groups
.into_iter()
.map(|[first, len]| first + len - 1)
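With the bounds check gone, the proof obligation moves to the caller: take_group_lasts is now an unsafe fn and must only be called when every group has at least one member. A generic illustration of why that contract matters, using plain vectors of indices rather than GroupsProxy:

// Generic sketch of the same pattern: taking the last index of each group with
// get_unchecked is only sound when every group is non-empty, hence the unsafe fn.
unsafe fn take_group_lasts(groups: &[Vec<u32>]) -> Vec<u32> {
    groups
        .iter()
        .map(|idx| *idx.get_unchecked(idx.len() - 1))
        .collect()
}

fn main() {
    let groups = vec![vec![0, 1, 2], vec![3, 4]];
    // SAFETY: both groups have at least one member.
    let lasts = unsafe { take_group_lasts(&groups) };
    assert_eq!(lasts, vec![2, 4]);
}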
6 changes: 3 additions & 3 deletions crates/polars-core/src/functions.rs
@@ -58,7 +58,7 @@ where
/// Concat [`DataFrame`]s horizontally.
#[cfg(feature = "horizontal_concat")]
/// Concat horizontally and extend with null values if lengths don't match
pub fn hor_concat_df(dfs: &[DataFrame]) -> PolarsResult<DataFrame> {
pub fn concat_df_horizontal(dfs: &[DataFrame]) -> PolarsResult<DataFrame> {
let max_len = dfs
.iter()
.map(|df| df.height())
@@ -98,7 +98,7 @@ pub fn hor_concat_df(dfs: &[DataFrame]) -> PolarsResult<DataFrame> {
/// Concat [`DataFrame`]s diagonally.
#[cfg(feature = "diagonal_concat")]
/// Concat diagonally thereby combining different schemas.
pub fn diag_concat_df(dfs: &[DataFrame]) -> PolarsResult<DataFrame> {
pub fn concat_df_diagonal(dfs: &[DataFrame]) -> PolarsResult<DataFrame> {
// TODO! replace with lazy only?
let upper_bound_width = dfs.iter().map(|df| df.width()).sum();
let mut column_names = AHashSet::with_capacity(upper_bound_width);
@@ -175,7 +175,7 @@ mod test {
"d" => [1, 2]
]?;

let out = diag_concat_df(&[a, b, c])?;
let out = concat_df_diagonal(&[a, b, c])?;

let expected = df![
"a" => [Some(1), Some(2), None, None, Some(5), Some(7)],
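The horizontal and diagonal helpers are renamed from hor_concat_df and diag_concat_df to concat_df_horizontal and concat_df_diagonal, so call sites need the new names. A hedged usage sketch (assuming the polars-core crate with the horizontal_concat and diagonal_concat features enabled, and that the functions module is public as in this file):

use polars_core::df;
use polars_core::functions::{concat_df_diagonal, concat_df_horizontal};
use polars_core::prelude::*;

fn example() -> PolarsResult<()> {
    let a = df!["a" => [1, 2]]?;
    let b = df!["b" => ["x", "y", "z"]]?;

    // Horizontal concat pads the shorter frame with nulls so the lengths match.
    let wide = concat_df_horizontal(&[a.clone(), b.clone()])?;

    // Diagonal concat unions the schemas and fills missing columns with nulls.
    let tall = concat_df_diagonal(&[a, b])?;

    println!("{wide}\n{tall}");
    Ok(())
}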