From fa719a59248430e5ceb3d9b2b2d3c988b105ed72 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Wed, 28 Jun 2023 13:27:40 +0000 Subject: [PATCH] working on improvements --- Cargo.toml | 2 +- README.md | 10 ++- examples/cardinality/main.rs | 160 +++++++++++++++-------------------- examples/wikidata_dump.rs | 2 +- src/pschema.rs | 43 +++++++--- src/shape/shape_tree.rs | 5 ++ src/shape/shex.rs | 20 +++-- src/utils/examples.rs | 23 ++++- 8 files changed, 148 insertions(+), 117 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b2e4dba..cd96283 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ keywords = ["pregel", "wikidata", "subsetting", "duckdb", "validation"] categories = ["algorithms", "database", "mathematics", "science"] [dependencies] -pregel-rs = { version = "0.0.13" } +pregel-rs = { path="../pregel-rs" } wikidata-rs = { version = "0.0.4" } polars = { version = "0.30.0", features = ["lazy", "is_in", "performant", "parquet", "chunked_ids", "list_eval", "dtype-categorical", "rows"] } duckdb = { version = "0.7.1" } diff --git a/README.md b/README.md index 11296ac..802d246 100644 --- a/README.md +++ b/README.md @@ -35,12 +35,18 @@ To use `pschema-rs` in your Rust project, you can add it as a dependency in your ```toml [dependencies] -pschema = "0.0.3" +pschema = "0.0.4" ``` ## Usage -Here's an example of how you can use `pschema-rs` to perform schema validation and generate a subset of data from Wikidata: +Here's an example of how you can use `pschema-rs` to perform schema validation and generate a subset of data from Wikidata. +Note that what we are doing here is first, defining the `ShapeExpression` we want the algorithm to validate. Next, we import +the Wikidata entities from a file. Note that the import methods we have defined create an edge DataFrame, and as such, we +need to call to the function `GraphFrame::from_edges(edges)`, which will build the GraphFrame from the imported edges. Lastly, +by calling `PSchema::new(start).validate(graph)`, we will both construct the `PSchema` algorithm provided the `ShapeExpression` +we have defined, first, and create the subset of the graph, second. Then, we print the results. Note that we can also export +the results to a file. See the [examples](https://github.com/angelip2303/pschema-rs/tree/main/examples) for more information. ```rust use pregel_rs::graph_frame::GraphFrame; diff --git a/examples/cardinality/main.rs b/examples/cardinality/main.rs index 5891504..81fde20 100644 --- a/examples/cardinality/main.rs +++ b/examples/cardinality/main.rs @@ -24,110 +24,84 @@ static GLOBAL: MiMalloc = MiMalloc; fn main() -> Result<(), String> { // Define validation rules - let shape = Cardinality::new( - ShapeOr::new( - "type", + let shape = ShapeReference::new( + "protein", + "", + ShapeAnd::new( + "annotation", vec![ - TripleConstraint::new( - "Transmembrane_Annotation", - "", - NodeConstraint::Value( - "", - ), + ShapeReference::new( + "range", + "", + ShapeAnd::new( + "grouping", + vec![ + ShapeReference::new( + "lower_range", + "", + TripleConstraint::new( + "begin", + "", + NodeConstraint::Any, + ) + .into(), + ) + .into(), + ShapeReference::new( + "upper_range", + "", + TripleConstraint::new( + "end", + "", + NodeConstraint::Any, + ) + .into(), + ) + .into(), + ], + ) + .into(), ) .into(), TripleConstraint::new( - "Transmembrane_Annotation", - "", - NodeConstraint::Value( - "", - ), + "comment", + "", + NodeConstraint::Any, + ) + .into(), + Cardinality::new( + "cardinality", + ShapeOr::new( + "type", + vec![ + TripleConstraint::new( + "Transmembrane_Annotation", + "", + NodeConstraint::Value( + "", + ), + ) + .into(), + TripleConstraint::new( + "Transmembrane_Annotation", + "", + NodeConstraint::Value( + "", + ), + ) + .into(), + ], + ) + .into(), + Bound::Zero, + Bound::Many, ) .into(), ], ) .into(), - Bound::Zero, - Bound::Many, ) .into(); - // let shape = ShapeReference::new( - // "protein", - // "", - // ShapeAnd::new( - // "annotation", - // vec![ - // ShapeReference::new( - // "range", - // "", - // ShapeAnd::new( - // "grouping", - // vec![ - // ShapeReference::new( - // "lower_range", - // "", - // TripleConstraint::new( - // "begin", - // "", - // NodeConstraint::Any, - // ) - // .into(), - // ) - // .into(), - // ShapeReference::new( - // "upper_range", - // "", - // TripleConstraint::new( - // "end", - // "", - // NodeConstraint::Any, - // ) - // .into(), - // ) - // .into(), - // ], - // ) - // .into(), - // ) - // .into(), - // TripleConstraint::new( - // "comment", - // "", - // NodeConstraint::Any, - // ) - // .into(), - // Cardinality::new( - // ShapeOr::new( - // "type", - // vec![ - // TripleConstraint::new( - // "Transmembrane_Annotation", - // "", - // NodeConstraint::Value( - // "", - // ), - // ) - // .into(), - // TripleConstraint::new( - // "Transmembrane_Annotation", - // "", - // NodeConstraint::Value( - // "", - // ), - // ) - // .into(), - // ], - // ) - // .into(), - // Bound::Zero, - // Bound::Many, - // ) - // .into(), - // ], - // ) - // .into(), - // ) - // .into(); // Load Wikidata entities let edges = NTriples::import("uniprotkb_reviewed_viruses_10239_0.nt")?; diff --git a/examples/wikidata_dump.rs b/examples/wikidata_dump.rs index 24a69b8..77783c2 100644 --- a/examples/wikidata_dump.rs +++ b/examples/wikidata_dump.rs @@ -32,7 +32,7 @@ fn main() -> Result<(), String> { )); // Load Wikidata entities - let edges = match DuckDB::import("../wd2duckdb/wikidata-20170821-all.duckdb") { + let edges = match DuckDB::import("wikidata-20170821-all.duckdb") { Ok(edges) => edges, Err(_) => return Err(String::from("Error creating the edges :(")), }; diff --git a/src/pschema.rs b/src/pschema.rs index a6cacf8..8c2e900 100644 --- a/src/pschema.rs +++ b/src/pschema.rs @@ -2,6 +2,7 @@ use crate::shape::shape_tree::{ShapeTree, ShapeTreeItem}; use crate::shape::shex::{Shape, Validate}; use crate::utils::check::check_field; +use polars::enable_string_cache; use polars::prelude::*; use pregel_rs::graph_frame::GraphFrame; use pregel_rs::pregel::{Column, MessageReceiver, PregelBuilder}; @@ -62,6 +63,7 @@ impl PSchema { /// there is an error during execution, it returns an `Err(PolarsError)` with a /// description of the error. pub fn validate(self, graph: GraphFrame) -> PolarsResult { + enable_string_cache(true); // First, we check if the graph has the required columns. If the graph does not have the // required columns or in case they are empty, we return an error. The required columns are: // - `subject`: the source vertex of the edge @@ -96,7 +98,12 @@ impl PSchema { .lazy() .select(&[ col(Column::VertexId.as_ref()), - col(Column::Custom("labels").as_ref()), + col(Column::Custom("labels").as_ref()) + .explode() + .drop_nulls() + .unique() + .implode() + .over([Column::VertexId.as_ref()]), ]) .filter( col(Column::Custom("labels").as_ref()) @@ -143,7 +150,13 @@ impl PSchema { } } } - ans.cast(DataType::Categorical(None)) + match concat_list([ + Column::subject(Column::Custom("labels")), + ans.cast(DataType::Categorical(None)), + ]) { + Ok(concat) => concat, + Err(_) => Column::subject(Column::Custom("labels")), + } } /// The function returns an expression that aggregates messages by exploding a @@ -156,7 +169,7 @@ impl PSchema { /// element in the column), and drops any rows that have NULL values in the /// resulting column. fn aggregate_messages() -> Expr { - Column::msg(None).filter(Column::msg(None).is_not_null()) + Column::msg(None).explode() } /// The function takes a shape iterator, validates the shapes in it, concatenates @@ -191,10 +204,10 @@ impl PSchema { #[cfg(test)] mod tests { use crate::pschema::PSchema; + use crate::shape::shex::Shape; use crate::utils::examples::Value::*; use crate::utils::examples::*; - use crate::shape::shex::Shape; use polars::df; use polars::prelude::*; use pregel_rs::graph_frame::GraphFrame; @@ -210,6 +223,7 @@ mod tests { .select([col("labels").list().lengths()]) .collect() .unwrap(); + println!("count: {:?}", count); match count == expected { true => Ok(()), false => return Err(String::from("The DataFrames are not equals")), @@ -225,18 +239,20 @@ mod tests { Ok(graph) => graph, Err(error) => return Err(error), }; - let expected = match DataFrame::new(vec![Series::new(Custom("labels").as_ref(), result)]) { Ok(expected) => expected, Err(_) => return Err(String::from("Error creating the expected DataFrame")), }; - match PSchema::new(schema).validate(graph) { Ok(actual) => { println!("actual: {:?}", actual); assert(expected, actual) } - Err(error) => Err(error.to_string()), + Err(error) => { + println!("asd"); + println!("{}", error); + Err(error.to_string()) + } } } @@ -252,17 +268,17 @@ mod tests { #[test] fn complex_test() -> Result<(), String> { - test(paper_graph(), vec![4u32, 1u32], complex_schema()) + test(paper_graph(), vec![4u32, 1u32, 1u32], complex_schema()) } #[test] fn reference_test() -> Result<(), String> { - test(paper_graph(), vec![1u32], reference_schema()) + test(paper_graph(), vec![2u32, 1u32, 1u32], reference_schema()) } #[test] fn optional_test() -> Result<(), String> { - test(paper_graph(), vec![1u32, 1u32], optional_schema()) + test(paper_graph(), vec![3u32, 1u32, 1u32], optional_schema()) } #[test] @@ -277,7 +293,12 @@ mod tests { #[test] fn cardinality_test() -> Result<(), String> { - test(paper_graph(), vec![1u32, 1u32], cardinality_schema()) + test(paper_graph(), vec![3u32, 1u32], cardinality_schema()) + } + + #[test] + fn vprog_to_vprog_test() -> Result<(), String> { + test(paper_graph(), vec![3u32], vprog_to_vprog()) } #[test] diff --git a/src/shape/shape_tree.rs b/src/shape/shape_tree.rs index 3b1e9f1..2a5d22a 100644 --- a/src/shape/shape_tree.rs +++ b/src/shape/shape_tree.rs @@ -165,4 +165,9 @@ pub mod tests { fn optional_schema_test() { assert_eq!(3, ShapeTree::new(optional_schema()).into_iter().count()) } + + #[test] + fn v_prog_to_vprog_schema_test() { + assert_eq!(3, ShapeTree::new(vprog_to_vprog()).into_iter().count()) + } } diff --git a/src/shape/shex.rs b/src/shape/shex.rs index 32f6fb7..686bb1f 100644 --- a/src/shape/shex.rs +++ b/src/shape/shex.rs @@ -146,6 +146,7 @@ pub struct ShapeOr { /// `Finite(usize)` to represent a specific number #[derive(Clone, Debug, PartialEq)] pub struct Cardinality { + label: &'static str, shape: Shape, min: Bound, max: Bound, @@ -454,8 +455,13 @@ impl Cardinality { /// The `new` function is returning an instance of the struct that it is defined in. /// The type of the returned value is `Self`, which in this case refers to the /// struct that the `new` function is defined in. - pub fn new(shape: Shape, min: Bound, max: Bound) -> Self { - Self { shape, min, max } + pub fn new(label: &'static str, shape: Shape, min: Bound, max: Bound) -> Self { + Self { + label, + shape, + min, + max, + } } /// This Rust function returns the shape of an object. @@ -506,12 +512,10 @@ impl Validate for Cardinality { Bound::Many => count.lt_eq(lit(u8::MAX)), }), ) - .then( - match concat_list([lit(self.get_shape().get_label()), prev.to_owned()]) { - Ok(concat) => concat, - Err(_) => prev.to_owned(), - }, - ) + .then(match concat_list([lit(self.label), prev.to_owned()]) { + Ok(concat) => concat, + Err(_) => prev.to_owned(), + }) .otherwise(prev) } } diff --git a/src/utils/examples.rs b/src/utils/examples.rs index de60716..ec11fd5 100644 --- a/src/utils/examples.rs +++ b/src/utils/examples.rs @@ -275,6 +275,7 @@ pub fn optional_schema() -> Shape { ) .into(), Cardinality::new( + "cardinality", TripleConstraint::new( "SomeAwardReceived", AwardReceived.id(), @@ -313,11 +314,12 @@ pub fn any_schema() -> Shape { pub fn cardinality_schema() -> Shape { ShapeAnd::new( - "Cardinality", + "grouping", vec![ TripleConstraint::new("Human", InstanceOf.id(), NodeConstraint::Value(Human.id())) .into(), Cardinality::new( + "cardinality", TripleConstraint::new("BirthPlace", BirthPlace.id(), NodeConstraint::Any).into(), Bound::Zero, Bound::Many, @@ -327,3 +329,22 @@ pub fn cardinality_schema() -> Shape { ) .into() } + +pub fn vprog_to_vprog() -> Shape { + Cardinality::new( + "cardinality", + ShapeAnd::new( + "grouping", + vec![TripleConstraint::new( + "UnitedKingdom", + BirthPlace.id(), + NodeConstraint::Value(London.id()), + ) + .into()], + ) + .into(), + Bound::Inclusive(1), + Bound::Inclusive(2), + ) + .into() +}