Skip to content

Commit

Permalink
refactor(df-repr): rm unionfind dependency (#228)
Browse files Browse the repository at this point in the history
The current union find is our fork with `unsafe impl Send + Sync` on an
`Rc` stored inside the union find, which is obviously not safe, so
I rewrote it. The union find we implemented can, in theory, be made
serializable by deriving `Serialize`.

Signed-off-by: Alex Chi <iskyzh@gmail.com>
  • Loading branch information
skyzh authored Nov 8, 2024
1 parent 3d81e65 commit 4f84645
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 12 deletions.
6 changes: 0 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion optd-datafusion-repr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,4 @@ camelpaste = "0.1"
datafusion-expr = "32.0.0"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3.3"
union-find = { git = "https://github.com/Gun9niR/union-find-rs.git", rev = "794821514f7daefcbb8d5f38ef04e62fc18b5665" }
value-bag = { version = "1", features = ["owned"] }
1 change: 1 addition & 0 deletions optd-datafusion-repr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ mod optimizer_ext;
pub mod plan_nodes;
pub mod properties;
pub mod rules;
mod utils;

#[cfg(test)]
mod testing;
Expand Down
11 changes: 6 additions & 5 deletions optd-datafusion-repr/src/properties/column_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ use std::sync::Arc;
use anyhow::anyhow;
use itertools::Itertools;
use optd_core::property::PropertyBuilder;
use union_find::disjoint_sets::DisjointSets;
use union_find::union_find::UnionFind;

use super::schema::Catalog;
use super::DEFAULT_NAME;
use crate::plan_nodes::{
decode_empty_relation_schema, ArcDfPredNode, BinOpType, ConstantPred, DfNodeType, DfPredType,
DfReprPredNode, JoinType, LogOpType,
use crate::{
plan_nodes::{
decode_empty_relation_schema, ArcDfPredNode, BinOpType, ConstantPred, DfNodeType,
DfPredType, DfReprPredNode, JoinType, LogOpType,
},
utils::DisjointSets,
};

pub type BaseTableColumnRefs = Vec<ColumnRef>;
Expand Down
123 changes: 123 additions & 0 deletions optd-datafusion-repr/src/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Copyright (c) 2023-2024 CMU Database Group
//
// Use of this source code is governed by an MIT-style license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

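//! optd's implementation of disjoint sets (union-find). It stores only owned data (no `Rc`), so it
//! is `Send + Sync` whenever `T` is, and it can be made serializable by deriving `Serialize`.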
use std::{collections::HashMap, hash::Hash};
/// A disjoint-set (union-find) structure over hashable values.
///
/// Every value registered through [`DisjointSets::make_set`] is assigned the next free index
/// (its insertion position). `parents` stores, for each index, the index of its parent; an
/// element that is its own parent is the representative (root) of its set.
#[derive(Clone)]
pub struct DisjointSets<T> {
    /// Maps each registered value to its index in `parents`.
    data_idx: HashMap<T, usize>,
    /// `parents[i]` is the parent of element `i`; roots satisfy `parents[i] == i`.
    parents: Vec<usize>,
}

impl<T> Default for DisjointSets<T> {
    /// Creates an empty structure.
    ///
    /// Implemented by hand so that `T` itself does not need to implement `Default`, which
    /// `#[derive(Default)]` would require.
    fn default() -> Self {
        Self {
            data_idx: HashMap::new(),
            parents: Vec::new(),
        }
    }
}

impl<T> std::fmt::Debug for DisjointSets<T> {
    /// Prints only the type name, so `T` is not required to implement `Debug`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "DisjointSets")
    }
}

impl<T: Eq + Hash> DisjointSets<T> {
    /// Creates an empty structure containing no elements.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` if `data` has been registered with [`DisjointSets::make_set`].
    pub fn contains(&self, data: &T) -> bool {
        self.data_idx.contains_key(data)
    }

    /// Registers `data` as a new singleton set.
    ///
    /// Returns `Some(())` on success, or `None` (leaving the structure untouched) if `data` is
    /// already registered.
    #[must_use]
    pub fn make_set(&mut self, data: T) -> Option<()> {
        // A single `entry` lookup both detects duplicates and inserts, and it consumes `data`
        // directly, so no clone of the key is needed.
        match self.data_idx.entry(data) {
            std::collections::hash_map::Entry::Occupied(_) => None,
            std::collections::hash_map::Entry::Vacant(slot) => {
                let idx = self.parents.len();
                slot.insert(idx);
                // A fresh element is its own parent, i.e. the root of a one-element set.
                self.parents.push(idx);
                Some(())
            }
        }
    }

    /// Returns the root of the set containing `idx`, shortening the path along the way.
    ///
    /// Each visited element is re-pointed to its grandparent before stepping to it (path
    /// halving), so later lookups through the same elements walk a shorter chain.
    fn find(&mut self, mut idx: usize) -> usize {
        while self.parents[idx] != idx {
            self.parents[idx] = self.parents[self.parents[idx]];
            idx = self.parents[idx];
        }
        idx
    }

    /// Returns the root of the set containing `idx` without modifying the structure.
    ///
    /// Used by the `&self` query methods, which cannot compress paths.
    fn find_const(&self, mut idx: usize) -> usize {
        while self.parents[idx] != idx {
            idx = self.parents[idx];
        }
        idx
    }

    /// Merges the sets containing `data1` and `data2`.
    ///
    /// Returns `None` without changing anything if either value is not registered; otherwise
    /// returns `Some(())`, including when both values already belong to the same set.
    #[must_use]
    pub fn union(&mut self, data1: &T, data2: &T) -> Option<()> {
        let idx1 = *self.data_idx.get(data1)?;
        let idx2 = *self.data_idx.get(data2)?;
        let root1 = self.find(idx1);
        let root2 = self.find(idx2);
        if root1 != root2 {
            // Attach the root of `data1`'s set underneath the root of `data2`'s set.
            self.parents[root1] = root2;
        }
        Some(())
    }

    /// Reports whether `data1` and `data2` belong to the same set.
    ///
    /// Returns `None` if either value is not registered.
    pub fn same_set(&self, data1: &T, data2: &T) -> Option<bool> {
        let idx1 = *self.data_idx.get(data1)?;
        let idx2 = *self.data_idx.get(data2)?;
        Some(self.find_const(idx1) == self.find_const(idx2))
    }

    /// Returns the number of elements in the set containing `data`.
    ///
    /// Returns `None` if `data` is not registered. The size is not cached: every registered
    /// element is visited and its root is compared against the root of `data`.
    pub fn set_size(&self, data: &T) -> Option<usize> {
        let idx = *self.data_idx.get(data)?;
        let root = self.find_const(idx);
        Some(
            (0..self.parents.len())
                .filter(|&member| self.find_const(member) == root)
                .count(),
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds five singleton sets and merges them step by step, checking membership and set
    /// sizes after every union.
    #[test]
    fn test_union_find() {
        let mut set = DisjointSets::new();
        set.make_set("a").unwrap();
        set.make_set("b").unwrap();
        set.make_set("c").unwrap();
        set.make_set("d").unwrap();
        set.make_set("e").unwrap();
        assert!(set.same_set(&"a", &"a").unwrap());
        assert!(!set.same_set(&"a", &"b").unwrap());
        assert_eq!(set.set_size(&"a").unwrap(), 1);
        assert_eq!(set.set_size(&"c").unwrap(), 1);
        set.union(&"a", &"b").unwrap();
        assert_eq!(set.set_size(&"a").unwrap(), 2);
        assert_eq!(set.set_size(&"c").unwrap(), 1);
        assert!(set.same_set(&"a", &"b").unwrap());
        assert!(!set.same_set(&"a", &"c").unwrap());
        set.union(&"b", &"c").unwrap();
        assert!(set.same_set(&"a", &"c").unwrap());
        assert!(!set.same_set(&"a", &"d").unwrap());
        assert_eq!(set.set_size(&"a").unwrap(), 3);
        assert_eq!(set.set_size(&"d").unwrap(), 1);
        set.union(&"d", &"e").unwrap();
        assert!(set.same_set(&"d", &"e").unwrap());
        assert!(!set.same_set(&"a", &"d").unwrap());
        assert_eq!(set.set_size(&"a").unwrap(), 3);
        assert_eq!(set.set_size(&"d").unwrap(), 2);
        set.union(&"c", &"e").unwrap();
        assert!(set.same_set(&"a", &"e").unwrap());
        assert_eq!(set.set_size(&"d").unwrap(), 5);
    }

    /// Registering a value twice is rejected, and every operation reports `None` for values
    /// that were never registered.
    #[test]
    fn test_duplicate_and_unknown_elements() {
        let mut set = DisjointSets::new();
        assert!(!set.contains(&"a"));
        set.make_set("a").unwrap();
        assert!(set.contains(&"a"));

        // A duplicate registration must fail and must not add a second element.
        assert!(set.make_set("a").is_none());
        assert_eq!(set.set_size(&"a").unwrap(), 1);

        assert!(!set.contains(&"missing"));
        assert!(set.union(&"a", &"missing").is_none());
        assert!(set.union(&"missing", &"a").is_none());
        assert!(set.same_set(&"a", &"missing").is_none());
        assert!(set.same_set(&"missing", &"a").is_none());
        assert!(set.set_size(&"missing").is_none());

        // The failed operations above must have left the existing set untouched.
        assert_eq!(set.set_size(&"a").unwrap(), 1);
    }

    /// Merging elements that already share a set succeeds and changes nothing.
    #[test]
    fn test_redundant_union() {
        let mut set = DisjointSets::new();
        set.make_set("a").unwrap();
        set.make_set("b").unwrap();
        set.union(&"a", &"a").unwrap();
        assert_eq!(set.set_size(&"a").unwrap(), 1);
        set.union(&"a", &"b").unwrap();
        set.union(&"b", &"a").unwrap();
        assert!(set.same_set(&"a", &"b").unwrap());
        assert_eq!(set.set_size(&"a").unwrap(), 2);
        assert_eq!(set.set_size(&"b").unwrap(), 2);
    }
}

0 comments on commit 4f84645

Please sign in to comment.