diff --git a/src/main/resources/configs/deduplication_dbpedia.xml b/src/main/resources/configs/deduplication_dbpedia.xml index b381b5ff..42bdbb09 100644 --- a/src/main/resources/configs/deduplication_dbpedia.xml +++ b/src/main/resources/configs/deduplication_dbpedia.xml @@ -19,14 +19,14 @@ id_dbpedia - ExactMatchString + ExactMatch 1 id_wikidata - ExactMatchString + ExactMatch 1 diff --git a/src/main/scala/de/hpi/ingestion/deduplication/similarity/ExactMatch.scala b/src/main/scala/de/hpi/ingestion/deduplication/similarity/ExactMatch.scala index 4ceff6e4..4d30509e 100644 --- a/src/main/scala/de/hpi/ingestion/deduplication/similarity/ExactMatch.scala +++ b/src/main/scala/de/hpi/ingestion/deduplication/similarity/ExactMatch.scala @@ -18,9 +18,8 @@ package de.hpi.ingestion.deduplication.similarity /** * An abstract binary similarity measure for exact matching - * @tparam T the type of data to be compared */ -abstract class ExactMatch[T] extends SimilarityMeasure[T] { +object ExactMatch extends SimilarityMeasure[Any] { /** * Comparing the given objects on exact matching * @param x object to be compared to y @@ -28,15 +27,5 @@ abstract class ExactMatch[T] extends SimilarityMeasure[T] { * @param u has no specific use in here * @return 1.0 if given objects match exactly, 0.0 otherwise */ - override def compare(x: T, y: T, u: Int = 1) = if(x == y) 1.0 else 0.0 + override def compare(x: Any, y: Any, u: Int = 1) = if(x == y) 1.0 else 0.0 } - -/** - * A specific exact match similarity measure comparing strings - */ -object ExactMatchString extends ExactMatch[String] - -/** - * A specific exact match similarity measure comparing Doubles - */ -object ExactMatchDouble extends ExactMatch[Double] diff --git a/src/main/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasure.scala b/src/main/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasure.scala index 20c285fc..d39a2104 100644 --- a/src/main/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasure.scala +++ b/src/main/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasure.scala @@ -20,7 +20,7 @@ package de.hpi.ingestion.deduplication.similarity * Provides a method to measure the similarity of two objects * @tparam T the type of the objects to be compared */ -trait SimilarityMeasure[T] extends Serializable { +trait SimilarityMeasure[-T] extends Serializable { /** * Calculates a similarity score for two objects @@ -38,8 +38,7 @@ trait SimilarityMeasure[T] extends Serializable { */ object SimilarityMeasure { val dataTypes: Map[String, SimilarityMeasure[_]] = Map( - "ExactMatchString" -> ExactMatchString, - "ExactMatchDouble" -> ExactMatchDouble, + "ExactMatch" -> ExactMatch, "MongeElkan" -> MongeElkan, "Jaccard" -> Jaccard, "DiceSorensen" -> DiceSorensen, @@ -60,6 +59,6 @@ object SimilarityMeasure { * @return the requested Similarity Measure if it exists or else Exact Match String as default */ def get[T](similarityMeasure: String): SimilarityMeasure[T] = { - dataTypes.getOrElse(similarityMeasure, ExactMatchString).asInstanceOf[SimilarityMeasure[T]] + dataTypes.getOrElse(similarityMeasure, ExactMatch).asInstanceOf[SimilarityMeasure[T]] } } diff --git a/src/test/resources/defaultDeduplication b/src/test/resources/defaultDeduplication index 26496fa0..818ed6a9 100644 --- a/src/test/resources/defaultDeduplication +++ b/src/test/resources/defaultDeduplication @@ -27,7 +27,7 @@ name - ExactMatchString + ExactMatch 0.2 1 diff --git a/src/test/resources/framework/test3.xml b/src/test/resources/framework/test3.xml index 8dfc0995..d5edb8ef 100644 --- a/src/test/resources/framework/test3.xml +++ b/src/test/resources/framework/test3.xml @@ -15,7 +15,7 @@ category - ExactMatchString + ExactMatch 1 diff --git a/src/test/scala/de/hpi/ingestion/deduplication/FeatureCalculationTest.scala b/src/test/scala/de/hpi/ingestion/deduplication/FeatureCalculationTest.scala index 9f8040ce..86274983 100644 --- a/src/test/scala/de/hpi/ingestion/deduplication/FeatureCalculationTest.scala +++ b/src/test/scala/de/hpi/ingestion/deduplication/FeatureCalculationTest.scala @@ -19,12 +19,12 @@ package de.hpi.ingestion.deduplication import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext} import de.hpi.ingestion.deduplication.models.FeatureEntry import de.hpi.ingestion.deduplication.models.config.SimilarityMeasureConfig -import de.hpi.ingestion.deduplication.similarity.{ExactMatchString, SimilarityMeasure} +import de.hpi.ingestion.deduplication.similarity.{ExactMatch, SimilarityMeasure} import org.scalatest.{FlatSpec, Matchers} class FeatureCalculationTest extends FlatSpec with Matchers with SharedSparkContext with RDDComparisons { "compare" should "calculate a similarity score of two subjects from a given config" in { - val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatchString, 1.0) + val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatch, 1.0) val attribute = "geo_city" val subject = TestData.subjects.head.get(attribute) val staging = TestData.stagings.head.get(attribute) @@ -35,7 +35,7 @@ class FeatureCalculationTest extends FlatSpec with Matchers with SharedSparkCont } it should "return 0.0 if one of the given subjects doesn't hold a property" in { - val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatchString, 1.0) + val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatch, 1.0) val attribute = "geo_city" val subject = TestData.subjects.head.get(attribute) val staging = TestData.subjects.last.get(attribute) diff --git a/src/test/scala/de/hpi/ingestion/deduplication/similarity/ExactMatchUnitTest.scala b/src/test/scala/de/hpi/ingestion/deduplication/similarity/ExactMatchUnitTest.scala index 8737277b..15277365 100644 --- a/src/test/scala/de/hpi/ingestion/deduplication/similarity/ExactMatchUnitTest.scala +++ b/src/test/scala/de/hpi/ingestion/deduplication/similarity/ExactMatchUnitTest.scala @@ -26,7 +26,7 @@ class ExactMatchUnitTest extends FlatSpec with Matchers { ("context", "context", 1.0)) testData.foreach(tuple => - ExactMatchString.compare(tuple._1, tuple._2) shouldEqual tuple._3) + ExactMatch.compare(tuple._1, tuple._2) shouldEqual tuple._3) } it should "return 1.0 or 0.0 for given doubles" in { @@ -35,6 +35,6 @@ class ExactMatchUnitTest extends FlatSpec with Matchers { (0.2, 0.4, 0.0)) testData.foreach(tuple => - ExactMatchDouble.compare(tuple._1, tuple._2) shouldEqual tuple._3) + ExactMatch.compare(tuple._1, tuple._2) shouldEqual tuple._3) } } diff --git a/src/test/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasureTest.scala b/src/test/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasureTest.scala index 226e530d..984a0d0f 100644 --- a/src/test/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasureTest.scala +++ b/src/test/scala/de/hpi/ingestion/deduplication/similarity/SimilarityMeasureTest.scala @@ -21,8 +21,7 @@ import org.scalatest.{FlatSpec, Matchers} class SimilarityMeasureTest extends FlatSpec with Matchers { "Similarity Measure" should "be returned given its name" in { - SimilarityMeasure.get[String]("ExactMatchString") shouldEqual ExactMatchString - SimilarityMeasure.get[Double]("ExactMatchDouble") shouldEqual ExactMatchDouble + SimilarityMeasure.get[String]("ExactMatch") shouldEqual ExactMatch SimilarityMeasure.get[String]("MongeElkan") shouldEqual MongeElkan SimilarityMeasure.get[String]("Jaccard") shouldEqual Jaccard SimilarityMeasure.get[String]("DiceSorensen") shouldEqual DiceSorensen @@ -32,6 +31,6 @@ class SimilarityMeasureTest extends FlatSpec with Matchers { SimilarityMeasure.get[String]("Overlap") shouldEqual Overlap SimilarityMeasure.get[String]("EuclidianDistance") shouldEqual EuclidianDistance SimilarityMeasure.get[String]("RelativeNumbersSimilarity") shouldEqual RelativeNumbersSimilarity - SimilarityMeasure.get[String]("Not existing") shouldEqual ExactMatchString + SimilarityMeasure.get[Any]("Not existing") shouldEqual ExactMatch } } diff --git a/src/test/scala/de/hpi/ingestion/framework/TestData.scala b/src/test/scala/de/hpi/ingestion/framework/TestData.scala index 2637a93d..dd30a987 100644 --- a/src/test/scala/de/hpi/ingestion/framework/TestData.scala +++ b/src/test/scala/de/hpi/ingestion/framework/TestData.scala @@ -17,7 +17,7 @@ limitations under the License. package de.hpi.ingestion.framework import de.hpi.ingestion.deduplication.models.config.{AttributeConfig, SimilarityMeasureConfig} -import de.hpi.ingestion.deduplication.similarity.{ExactMatchString, JaroWinkler, MongeElkan} +import de.hpi.ingestion.deduplication.similarity.{ExactMatch, JaroWinkler, MongeElkan, SimilarityMeasure} import scala.xml.{Node, XML} @@ -48,7 +48,7 @@ object TestData { "category", 0.5, List( - SimilarityMeasureConfig(similarityMeasure = ExactMatchString, weight = 1.0) + SimilarityMeasureConfig(similarityMeasure = ExactMatch, weight = 1.0) ) ) )