From ceea7379712995f3ff9d832ac0be0662fdaab620 Mon Sep 17 00:00:00 2001 From: Edmond Date: Fri, 13 Dec 2024 22:15:34 +1100 Subject: [PATCH 1/4] feat: use the RGDA1 canonicalization algorithm + lexical n-triples sort to produce deterministic longturtle serialisation --- rdflib/plugins/serializers/longturtle.py | 38 ++---- test/data/longturtle/longturtle-target.ttl | 82 ++++++------ .../test_serializer_longturtle_sort.py | 117 ++++++++++++++++++ 3 files changed, 167 insertions(+), 70 deletions(-) create mode 100644 test/test_serializers/test_serializer_longturtle_sort.py diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py index f596a7ded..ed5e385d1 100644 --- a/rdflib/plugins/serializers/longturtle.py +++ b/rdflib/plugins/serializers/longturtle.py @@ -20,7 +20,9 @@ from typing import IO, Any, Optional +from rdflib.compare import to_canonical_graph from rdflib.exceptions import Error +from rdflib.graph import Graph from rdflib.namespace import RDF from rdflib.term import BNode, Literal, URIRef @@ -42,7 +44,14 @@ class LongTurtleSerializer(RecursiveSerializer): def __init__(self, store): self._ns_rewrite = {} - super(LongTurtleSerializer, self).__init__(store) + store = to_canonical_graph(store) + content = store.serialize(format="application/n-triples") + lines = content.split("\n") + lines.sort() + graph = Graph() + graph.parse(data="\n".join(lines), format="nt", skolemize=True) + graph = graph.de_skolemize() + super(LongTurtleSerializer, self).__init__(graph) self.keywords = {RDF.type: "a"} self.reset() self.stream = None @@ -293,34 +302,7 @@ def predicateList(self, subject, newline=False): def verb(self, node, newline=False): self.path(node, VERB, newline) - def sortObjects( - self, values: list[URIRef | BNode | Literal] - ) -> list[URIRef | BNode | Literal]: - """ - Perform a sort on the values where each value is a blank node. Grab the CBD of the - blank node and sort it by its longturtle serialization value. - - Identified nodes come first and the sorted blank nodes are tacked on after. - """ - bnode_map: dict[BNode, list[str]] = {} - objects = [] - for value in values: - if isinstance(value, BNode): - bnode_map[value] = [] - else: - objects.append(value) - - for bnode in bnode_map: - cbd = self.store.cbd(bnode).serialize(format="longturtle") - bnode_map[bnode].append(cbd) - - sorted_bnodes = sorted( - [(k, v) for k, v in bnode_map.items()], key=lambda x: x[1] - ) - return objects + [x[0] for x in sorted_bnodes] - def objectList(self, objects): - objects = self.sortObjects(objects) count = len(objects) if count == 0: return diff --git a/test/data/longturtle/longturtle-target.ttl b/test/data/longturtle/longturtle-target.ttl index 329f2ca0c..54cf23e9f 100644 --- a/test/data/longturtle/longturtle-target.ttl +++ b/test/data/longturtle/longturtle-target.ttl @@ -1,74 +1,72 @@ -PREFIX cn: -PREFIX ex: PREFIX geo: PREFIX rdf: -PREFIX sdo: +PREFIX schema: PREFIX xsd: -ex:nicholas - a sdo:Person ; - sdo:age 41 ; - sdo:alternateName - "N.J. Car" , - "Nick Car" , + + a schema:Person ; + schema:age 41 ; + schema:alternateName [ - sdo:name "Dr N.J. Car" ; - ] ; - sdo:name + schema:name "Dr N.J. Car" ; + ] , + "N.J. Car" , + "Nick Car" ; + schema:name [ - a cn:CompoundName ; - sdo:hasPart - [ - a cn:CompoundName ; - rdf:value "John" ; - ] , + a ; + schema:hasPart [ - a cn:CompoundName ; - rdf:value "Nicholas" ; - ] , - [ - a cn:CompoundName ; - sdo:hasPart + a ; + schema:hasPart [ - a cn:CompoundName ; + a ; rdf:value "Car" ; ] , [ - a cn:CompoundName ; + a ; rdf:value "Maxov" ; ] ; + ] , + [ + a ; + rdf:value "Nicholas" ; + ] , + [ + a ; + rdf:value "John" ; ] ; ] ; - sdo:worksFor ; + schema:worksFor ; . - a sdo:Organization ; - sdo:location ; + a schema:Organization ; + schema:location ; . - a sdo:Place ; - sdo:address + a schema:Place ; + schema:address [ - a sdo:PostalAddress ; - sdo:addressCountry + a schema:PostalAddress ; + schema:addressCountry [ - sdo:identifier "au" ; - sdo:name "Australia" ; + schema:identifier "au" ; + schema:name "Australia" ; ] ; - sdo:addressLocality "Shorncliffe" ; - sdo:addressRegion "QLD" ; - sdo:postalCode 4017 ; - sdo:streetAddress ( + schema:addressLocality "Shorncliffe" ; + schema:addressRegion "QLD" ; + schema:postalCode 4017 ; + schema:streetAddress ( 72 "Yundah" "Street" ) ; ] ; - sdo:geo + schema:geo [ - sdo:polygon "POLYGON((153.082403 -27.325801, 153.08241 -27.32582, 153.082943 -27.325612, 153.083010 -27.325742, 153.083543 -27.325521, 153.083456 -27.325365, 153.082403 -27.325801))"^^geo:wktLiteral ; + schema:polygon "POLYGON((153.082403 -27.325801, 153.08241 -27.32582, 153.082943 -27.325612, 153.083010 -27.325742, 153.083543 -27.325521, 153.083456 -27.325365, 153.082403 -27.325801))"^^geo:wktLiteral ; ] ; - sdo:name "KurrawongAI HQ" ; + schema:name "KurrawongAI HQ" ; . diff --git a/test/test_serializers/test_serializer_longturtle_sort.py b/test/test_serializers/test_serializer_longturtle_sort.py new file mode 100644 index 000000000..9eba6e187 --- /dev/null +++ b/test/test_serializers/test_serializer_longturtle_sort.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +# Portions of this file contributed by NIST are governed by the +# following statement: +# +# This software was developed at the National Institute of Standards +# and Technology by employees of the Federal Government in the course +# of their official duties. Pursuant to Title 17 Section 105 of the +# United States Code, this software is not subject to copyright +# protection within the United States. NIST assumes no responsibility +# whatsoever for its use by other parties, and makes no guarantees, +# expressed or implied, about its quality, reliability, or any other +# characteristic. +# +# We would appreciate acknowledgement if the software is used. + +import random +from collections import defaultdict + +from rdflib import RDFS, BNode, Graph, Literal, Namespace, URIRef + +EX = Namespace("http://example.org/ex/") + + +def test_sort_semiblank_graph() -> None: + """ + This test reviews whether the output of the Turtle form is + consistent when involving repeated generates with blank nodes. + """ + + serialization_counter: defaultdict[str, int] = defaultdict(int) + + first_graph_text: str = "" + + # Use a fixed sequence of once-but-no-longer random values for more + # consistent test results. + nonrandom_shuffler = random.Random(1234) + for x in range(1, 10): + graph = Graph() + graph.bind("ex", EX) + graph.bind("rdfs", RDFS) + + graph.add((EX.A, RDFS.comment, Literal("Thing A"))) + graph.add((EX.B, RDFS.comment, Literal("Thing B"))) + graph.add((EX.C, RDFS.comment, Literal("Thing C"))) + + nodes: list[URIRef] = [EX.A, EX.B, EX.C, EX.B] + nonrandom_shuffler.shuffle(nodes) + for node in nodes: + # Instantiate one bnode per URIRef node. + graph.add((BNode(), RDFS.seeAlso, node)) + + nesteds: list[URIRef] = [EX.A, EX.B, EX.C] + nonrandom_shuffler.shuffle(nesteds) + for nested in nesteds: + # Instantiate a nested node reference. + outer_node = BNode() + inner_node = BNode() + graph.add((outer_node, EX.has, inner_node)) + graph.add((inner_node, RDFS.seeAlso, nested)) + + graph_text = graph.serialize(format="longturtle", sort=True) + if first_graph_text == "": + first_graph_text = graph_text + + serialization_counter[graph_text] += 1 + + expected_serialization = """\ +PREFIX ns1: +PREFIX rdfs: + +ns1:A + rdfs:comment "Thing A" ; +. + +ns1:C + rdfs:comment "Thing C" ; +. + +ns1:B + rdfs:comment "Thing B" ; +. + +[] ns1:has + [ + rdfs:seeAlso ns1:A ; + ] ; ; +. + +[] rdfs:seeAlso ns1:B ; ; +. + +[] ns1:has + [ + rdfs:seeAlso ns1:C ; + ] ; ; +. + +[] rdfs:seeAlso ns1:A ; ; +. + +[] rdfs:seeAlso ns1:C ; ; +. + +[] rdfs:seeAlso ns1:B ; ; +. + +[] ns1:has + [ + rdfs:seeAlso ns1:B ; + ] ; ; +. + +""" + + assert expected_serialization.strip() == first_graph_text.strip() + assert 1 == len(serialization_counter) From e4845dae757a07599112c086610c82e077da2ae0 Mon Sep 17 00:00:00 2001 From: Edmond Date: Fri, 13 Dec 2024 22:22:05 +1100 Subject: [PATCH 2/4] chore: normalise usage of format --- rdflib/plugins/serializers/longturtle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py index ed5e385d1..6626cb4c7 100644 --- a/rdflib/plugins/serializers/longturtle.py +++ b/rdflib/plugins/serializers/longturtle.py @@ -49,7 +49,7 @@ def __init__(self, store): lines = content.split("\n") lines.sort() graph = Graph() - graph.parse(data="\n".join(lines), format="nt", skolemize=True) + graph.parse(data="\n".join(lines), format="application/n-triples", skolemize=True) graph = graph.de_skolemize() super(LongTurtleSerializer, self).__init__(graph) self.keywords = {RDF.type: "a"} From 7405e32109cca9c63cddb54a1f689b568c7aef04 Mon Sep 17 00:00:00 2001 From: Edmond Date: Sat, 14 Dec 2024 00:03:26 +1100 Subject: [PATCH 3/4] chore: apply black --- rdflib/plugins/serializers/longturtle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py index 6626cb4c7..cc245441c 100644 --- a/rdflib/plugins/serializers/longturtle.py +++ b/rdflib/plugins/serializers/longturtle.py @@ -49,7 +49,9 @@ def __init__(self, store): lines = content.split("\n") lines.sort() graph = Graph() - graph.parse(data="\n".join(lines), format="application/n-triples", skolemize=True) + graph.parse( + data="\n".join(lines), format="application/n-triples", skolemize=True + ) graph = graph.de_skolemize() super(LongTurtleSerializer, self).__init__(graph) self.keywords = {RDF.type: "a"} From 412fb5d63d79c5d5816e05751e6be6bd791b1385 Mon Sep 17 00:00:00 2001 From: Edmond Date: Sat, 14 Dec 2024 03:06:01 +1100 Subject: [PATCH 4/4] fix: double up of semicolons when subject is a blank node --- rdflib/plugins/serializers/longturtle.py | 2 +- .../test_serializer_longturtle_sort.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rdflib/plugins/serializers/longturtle.py b/rdflib/plugins/serializers/longturtle.py index cc245441c..8de1e52a2 100644 --- a/rdflib/plugins/serializers/longturtle.py +++ b/rdflib/plugins/serializers/longturtle.py @@ -197,7 +197,7 @@ def s_squared(self, subject): return False self.write("\n" + self.indent() + "[]") self.predicateList(subject, newline=False) - self.write(" ;\n.") + self.write("\n.") return True def path(self, node, position, newline=False): diff --git a/test/test_serializers/test_serializer_longturtle_sort.py b/test/test_serializers/test_serializer_longturtle_sort.py index 9eba6e187..df0e38676 100644 --- a/test/test_serializers/test_serializer_longturtle_sort.py +++ b/test/test_serializers/test_serializer_longturtle_sort.py @@ -84,31 +84,31 @@ def test_sort_semiblank_graph() -> None: [] ns1:has [ rdfs:seeAlso ns1:A ; - ] ; ; + ] ; . -[] rdfs:seeAlso ns1:B ; ; +[] rdfs:seeAlso ns1:B ; . [] ns1:has [ rdfs:seeAlso ns1:C ; - ] ; ; + ] ; . -[] rdfs:seeAlso ns1:A ; ; +[] rdfs:seeAlso ns1:A ; . -[] rdfs:seeAlso ns1:C ; ; +[] rdfs:seeAlso ns1:C ; . -[] rdfs:seeAlso ns1:B ; ; +[] rdfs:seeAlso ns1:B ; . [] ns1:has [ rdfs:seeAlso ns1:B ; - ] ; ; + ] ; . """