From 4511da95c38ec978537295ab5d4ba374a9399961 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Champin Date: Sun, 3 Mar 2024 08:49:10 +0100 Subject: [PATCH] fix issues in c14n - some new tests from the test suite didn't pass (related to duplicate triples) - there was a bug when sorting quads, when only one of them had a graph name --- c14n/src/_c14n_term.rs | 12 ++++++++---- c14n/src/rdfc10.rs | 33 +++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/c14n/src/_c14n_term.rs b/c14n/src/_c14n_term.rs index 8d437ec8..1c92e79b 100644 --- a/c14n/src/_c14n_term.rs +++ b/c14n/src/_c14n_term.rs @@ -82,12 +82,16 @@ impl Term for C14nTerm { } pub fn cmp_c14n_terms<'a, 'b, T: Term>( - t1: &'a C14nTerm, - t2: &'a C14nTerm, + t1: Option<&'a C14nTerm>, + t2: Option<&'a C14nTerm>, buf1: &'b mut String, buf2: &'b mut String, ) -> Ordering { - nq(t1, buf1); - nq(t2, buf2); + if let Some(t1) = t1 { + nq(t1, buf1); + } + if let Some(t2) = t2 { + nq(t2, buf2); + } buf1.cmp(&buf2) } diff --git a/c14n/src/rdfc10.rs b/c14n/src/rdfc10.rs index 2c922010..a0a5e881 100644 --- a/c14n/src/rdfc10.rs +++ b/c14n/src/rdfc10.rs @@ -7,7 +7,7 @@ use std::fmt::Write; use std::io; use std::rc::Rc; -use sophia_api::dataset::{DTerm, Dataset}; +use sophia_api::dataset::{DTerm, SetDataset}; use sophia_api::quad::{iter_spog, Quad, Spog}; use sophia_api::term::{BnodeId, Term}; @@ -25,7 +25,7 @@ use crate::hash::{HashFunction, Sha256, Sha384}; /// - quads are sorted in codepoint order. /// /// See also [`normalize_with`]. -pub fn normalize(d: &D, w: W) -> Result<(), C14nError> { +pub fn normalize(d: &D, w: W) -> Result<(), C14nError> { normalize_with::(d, w, DEFAULT_DEPTH_FACTOR, DEFAULT_PERMUTATION_LIMIT) } @@ -37,7 +37,10 @@ pub fn normalize(d: &D, w: W) -> Result<(), C14nError< /// - quads are sorted in codepoint order. /// /// See also [`normalize_with`]. 
-pub fn normalize_sha384(d: &D, w: W) -> Result<(), C14nError> { +pub fn normalize_sha384( + d: &D, + w: W, +) -> Result<(), C14nError> { normalize_with::(d, w, DEFAULT_DEPTH_FACTOR, DEFAULT_PERMUTATION_LIMIT) } @@ -49,7 +52,7 @@ pub fn normalize_sha384(d: &D, w: W) -> Result<(), C14 /// - quads are sorted in codepoint order. /// /// See also [`normalize`]. -pub fn normalize_with( +pub fn normalize_with( d: &D, mut w: W, depth_factor: f32, @@ -61,7 +64,7 @@ pub fn normalize_with( // we sort the quads, but comparing the terms based on ther NQ serialization, // which amounts to sorting the N-Quads lines without materializing them quads.sort_unstable_by(|q1, q2| { - for (t1, t2) in iter_spog(q1.spog()).zip(iter_spog(q2.spog())) { + for (t1, t2) in iter_spog_opt(q1.spog()).zip(iter_spog_opt(q2.spog())) { buf1.clear(); buf2.clear(); let o = cmp_c14n_terms(t1, t2, &mut buf1, &mut buf2); @@ -95,7 +98,7 @@ pub fn normalize_with( /// Implements /// /// See also [`normalize`]. -pub fn relabel(d: &D) -> Result<(C14nQuads, C14nIdMap), C14nError> { +pub fn relabel(d: &D) -> Result<(C14nQuads, C14nIdMap), C14nError> { relabel_with::(d, DEFAULT_DEPTH_FACTOR, DEFAULT_PERMUTATION_LIMIT) } @@ -109,7 +112,9 @@ pub fn relabel(d: &D) -> Result<(C14nQuads, C14nIdMap), C14nError /// Implements /// /// See also [`normalize`]. -pub fn relabel_sha384(d: &D) -> Result<(C14nQuads, C14nIdMap), C14nError> { +pub fn relabel_sha384( + d: &D, +) -> Result<(C14nQuads, C14nIdMap), C14nError> { relabel_with::(d, DEFAULT_DEPTH_FACTOR, DEFAULT_PERMUTATION_LIMIT) } @@ -135,7 +140,7 @@ pub fn relabel_sha384(d: &D) -> Result<(C14nQuads, C14nIdMap), C1 /// Implements /// /// See also [`relabel`], [`normalize_with`]. 
-pub fn relabel_with<'a, H: HashFunction, D: Dataset>( +pub fn relabel_with<'a, H: HashFunction, D: SetDataset>( d: &'a D, depth_factor: f32, permutation_limit: usize, @@ -497,6 +502,14 @@ fn smaller_path(path1: &str, path2: &str) -> bool { } } +/// Iterate over all the components of a [`Quad`] as Option. +/// +/// Compared to [`iter_spog`], this function always returns 4 components. +fn iter_spog_opt(q: T) -> impl Iterator> { + let (spo, g) = q.to_spog(); + spo.into_iter().map(Some).chain(std::iter::once(g)) +} + #[cfg(test)] mod test { use super::*; @@ -701,7 +714,7 @@ _:c14n4 _:c14n3 . assert!(got == exp); } - pub fn c14n_nquads(d: &D) -> Result> { + pub fn c14n_nquads(d: &D) -> Result> { let mut output = Vec::::new(); normalize(d, &mut output)?; Ok(unsafe { String::from_utf8_unchecked(output) }) @@ -709,7 +722,7 @@ _:c14n4 _:c14n3 . /// Simplisitic Quad parser, useful for writing test cases. /// It is based on eq_quad below. - fn ez_quads<'a>(lines: &[&'a str]) -> Vec>> { + fn ez_quads<'a>(lines: &[&'a str]) -> std::collections::HashSet>> { lines.iter().map(|line| ez_quad(line)).collect() }