check_similarity.py
# -*- coding: utf-8 -*-
"""
[Martinez-Gil2023d] Framework to Automatically Determine the Quality of Open Data Catalogs, arXiv preprint arXiv:2307.15464, 2023
@author: Jorge Martinez-Gil
"""
import sys

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rdflib import Graph, Namespace

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


def preprocess_text(text):
    """
    Preprocesses a text by tokenizing it into sentences and words, converting to lowercase, and removing stopwords.

    Args:
        text (str): The text to preprocess.

    Returns:
        set: A set of preprocessed words.
    """
    # Tokenize the sentences and convert to lowercase
    sentences = sent_tokenize(text.lower())

    # Tokenize words in each sentence and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word for word in words if word not in stop_words]
        tokens.extend(words)
    return set(tokens)
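
# Illustrative example (the input string is hypothetical): preprocess_text
# lowercases the text, tokenizes it, and drops English stopwords, so
#   preprocess_text("Open Data Catalog of the City")
# would return the set {'open', 'data', 'catalog', 'city'}.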


def jaccard_similarity(set1, set2):
    """
    Calculates the Jaccard similarity between two sets.

    Args:
        set1 (set): The first set.
        set2 (set): The second set.

    Returns:
        float: The Jaccard similarity between the two sets.
    """
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0
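
# Worked example (hypothetical sets): for {'open', 'data', 'catalog'} and
# {'open', 'data', 'portal'}, the intersection has 2 elements and the union
# has 4, so jaccard_similarity returns 2 / 4 = 0.5.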


def are_graphs_identical(g1, g2):
    """
    Checks if two RDF graphs are identical.

    Args:
        g1 (rdflib.Graph): The first RDF graph.
        g2 (rdflib.Graph): The second RDF graph.

    Returns:
        bool: True if the graphs are identical, False otherwise.
    """
    return len(g1) == len(g2) and all(t in g2 for t in g1) and all(t in g1 for t in g2)


def check_similarity(catalog1_file, catalog2_file):
    """
    Calculates the similarity between two DCAT catalogs in Turtle format.

    Args:
        catalog1_file (str): The path to the first catalog file.
        catalog2_file (str): The path to the second catalog file.

    Returns:
        float: The similarity between the two catalogs as a percentage.
    """
    # Parse the Turtle files into RDF graphs
    g1 = Graph()
    g1.parse(catalog1_file, format='ttl')
    g2 = Graph()
    g2.parse(catalog2_file, format='ttl')

    # Check if the graphs are identical
    if are_graphs_identical(g1, g2):
        return 100.0

    # Define DCAT namespace
    dcat = Namespace('http://www.w3.org/ns/dcat#')

    # Extract titles and descriptions from both catalogs
    titles1 = [str(title) for title in g1.objects(predicate=dcat.title)]
    titles2 = [str(title) for title in g2.objects(predicate=dcat.title)]
    descriptions1 = [str(desc) for desc in g1.objects(predicate=dcat.description)]
    descriptions2 = [str(desc) for desc in g2.objects(predicate=dcat.description)]

    # Preprocess titles and descriptions
    titles1 = [preprocess_text(title) for title in titles1]
    titles2 = [preprocess_text(title) for title in titles2]
    descriptions1 = [preprocess_text(desc) for desc in descriptions1]
    descriptions2 = [preprocess_text(desc) for desc in descriptions2]

    # Calculate the average pairwise Jaccard similarity for titles and descriptions
    if titles1 and titles2:
        title_similarity = sum(jaccard_similarity(title1, title2) for title1 in titles1 for title2 in titles2) / (len(titles1) * len(titles2))
    else:
        title_similarity = 0.0
    if descriptions1 and descriptions2:
        description_similarity = sum(jaccard_similarity(desc1, desc2) for desc1 in descriptions1 for desc2 in descriptions2) / (len(descriptions1) * len(descriptions2))
    else:
        description_similarity = 0.0

    # Overall similarity as the average of title and description similarity
    overall_similarity = (title_similarity + description_similarity) / 2
    return overall_similarity * 100
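
# Minimal usage sketch (the file names below are hypothetical): check_similarity
# can also be called directly from Python rather than via the command line, e.g.
#   score = check_similarity('catalog_a.ttl', 'catalog_b.ttl')
#   print(f"The catalogs are {score}% similar.")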
"""
Main function that calculates the similarity between two DCAT catalogs in Turtle format and prints the result.
Usage: python check_similarity.py filepath filepath2
"""
def main():
# Get paths to RDF data files from command line arguments
if len(sys.argv) < 3:
print("Usage: python check_similarity.py filepath filepath2")
sys.exit(1)
rdf_data_path = sys.argv[1]
rdf_data_path2 = sys.argv[2]
# Calculate similarity
result = check_similarity(rdf_data_path, rdf_data_path2)
print(f"The similarity of {rdf_data_path} and {rdf_data_path2} is {result}%.")
if __name__ == "__main__":
main()