diff --git a/app/src/main/java/me/devsaki/hentoid/util/ContentHelper.java b/app/src/main/java/me/devsaki/hentoid/util/ContentHelper.java index b9097a855f..39defa0dd4 100644 --- a/app/src/main/java/me/devsaki/hentoid/util/ContentHelper.java +++ b/app/src/main/java/me/devsaki/hentoid/util/ContentHelper.java @@ -1537,8 +1537,8 @@ public static ImmutablePair findDuplicate(@NonNull final Context // Refine by running the actual duplicate detection algorithm against the rough candidates List entries = new ArrayList<>(); StringSimilarity cosine = new Cosine(); - DuplicateHelper.DuplicateCandidate reference = new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useLanguage, useCover, pHash); - List candidates = Stream.of(roughCandidates).map(c -> new DuplicateHelper.DuplicateCandidate(c, useTitle, useArtist, useLanguage, useCover, Long.MIN_VALUE)).toList(); + DuplicateHelper.DuplicateCandidate reference = new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useLanguage, useCover, true, pHash); + List candidates = Stream.of(roughCandidates).map(c -> new DuplicateHelper.DuplicateCandidate(c, useTitle, useArtist, useLanguage, useCover, true, Long.MIN_VALUE)).toList(); for (DuplicateHelper.DuplicateCandidate candidate : candidates) { DuplicateEntry entry = DuplicateHelper.Companion.processContent(reference, candidate, useTitle, useCover, useArtist, useLanguage, true, 2, cosine); if (entry != null) entries.add(entry); diff --git a/app/src/main/java/me/devsaki/hentoid/util/DuplicateHelper.kt b/app/src/main/java/me/devsaki/hentoid/util/DuplicateHelper.kt index d1728c7a2a..f6908ae8a8 100644 --- a/app/src/main/java/me/devsaki/hentoid/util/DuplicateHelper.kt +++ b/app/src/main/java/me/devsaki/hentoid/util/DuplicateHelper.kt @@ -13,10 +13,11 @@ import me.devsaki.hentoid.util.file.FileHelper import me.devsaki.hentoid.util.image.ImageHelper import me.devsaki.hentoid.util.image.ImagePHash import me.devsaki.hentoid.util.string_similarity.StringSimilarity 
+import org.apache.commons.lang3.tuple.ImmutableTriple import timber.log.Timber import java.io.IOException import java.io.InputStream -import java.util.* +import java.util.Collections import java.util.concurrent.atomic.AtomicBoolean class DuplicateHelper { @@ -155,7 +156,8 @@ class DuplicateHelper { try { // Update the book JSON if the book folder still exists if (content.storageUri.isNotEmpty()) { - val folder = FileHelper.getDocumentFromTreeUriString(context, content.storageUri) + val folder = + FileHelper.getDocumentFromTreeUriString(context, content.storageUri) if (folder != null) { if (content.jsonUri.isNotEmpty()) ContentHelper.updateJson( context, @@ -200,8 +202,8 @@ class DuplicateHelper { } if (useTitle) titleScore = computeTitleScore( textComparator, - reference.titleCleanup, reference.titleNoDigits, - candidate.titleCleanup, candidate.titleNoDigits, + reference, + candidate, ignoreChapters, sensitivity ) @@ -250,31 +252,30 @@ class DuplicateHelper { fun computeTitleScore( textComparator: StringSimilarity, - referenceTitleCleanup: String, - referenceTitleNoDigits: String, - candidateTitleCleanup: String, - candidateTitleNoDigits: String, + reference: DuplicateCandidate, + candidate: DuplicateCandidate, ignoreChapters: Boolean, sensitivity: Int ): Float { val similarity1 = - textComparator.similarity(referenceTitleCleanup, candidateTitleCleanup) + textComparator.similarity(reference.titleCleanup, candidate.titleCleanup) if (ignoreChapters) { // Perfect match if (similarity1 > 0.995) return similarity1.toFloat() // Other cases : check if both titles are chapters or sequels return if (similarity1 > TEXT_THRESHOLDS[sensitivity]) { val similarity2 = - textComparator.similarity(referenceTitleNoDigits, candidateTitleNoDigits) + textComparator.similarity(reference.titleNoDigits, candidate.titleNoDigits) // Cleaned up versions are identical - // => most probably a chapter variant -> set to 0% - if (similarity2 > similarity1 && similarity2 > 0.995) return 0f + // 
=> most probably a chapter variant + if (similarity2 > similarity1 && similarity2 > 0.995) + return processChapterVariants(reference, candidate, similarity1.toFloat()) // Very little difference between cleaned up and original version // => not a chapter variant if (similarity2 - similarity1 < 0.01) { similarity1.toFloat() - } else { - 0f // Most probably a chapter variant -> set to 0% + } else { // Most probably a chapter variant + return processChapterVariants(reference, candidate, similarity1.toFloat()) } } else { 0f // Below threshold @@ -282,10 +283,43 @@ class DuplicateHelper { } else return if (similarity1 >= TEXT_THRESHOLDS[sensitivity]) similarity1.toFloat() else 0f } - fun sanitizeTitle(title: String): String { + private fun processChapterVariants( + reference: DuplicateCandidate, + candidate: DuplicateCandidate, + similarity: Float + ): Float { + // No numbers to compare (e.g. "gaiden" / "ex") + if (-1 == reference.maxChapterBound || -1 == candidate.maxChapterBound) return 0f + + // Chapter numbers overlap (two variants) => don't ignore it, that's an actual duplicate + if (reference.minChapterBound >= candidate.minChapterBound && reference.minChapterBound <= candidate.maxChapterBound) return similarity + if (candidate.minChapterBound >= reference.minChapterBound && candidate.minChapterBound <= reference.maxChapterBound) return similarity + + return 0f + } + + fun sanitizeTitle(title: String): Triple { + // Compute min and max chapter value + // These are to be : + // - Located in the last 20% of the title + // - Separated by at most 4 characters + var minChapter: ImmutableTriple? = null + var maxChapter: ImmutableTriple? 
= null + val digitsMap = StringHelper.locateDigits(title).reversed() + digitsMap.forEach { + if (it.middle >= title.length * 0.8 && null == maxChapter) maxChapter = it + else maxChapter?.let { max -> + if (it.middle >= max.left - 5) minChapter = it + } + } + if (maxChapter != null && null == minChapter) minChapter = maxChapter + val minChapterValue = if (minChapter != null) minChapter!!.right else -1 + val maxChapterValue = if (maxChapter != null) maxChapter!!.right else -1 + + // Sanitize the title var result = StringHelper.removeDigits(title) for (s in TITLE_CHAPTER_WORDS) result = result.replace(s, "") - return result + return Triple(result, minChapterValue, maxChapterValue) } private fun computeArtistScore( @@ -311,21 +345,36 @@ class DuplicateHelper { useArtist: Boolean, useLanguage: Boolean, useCover: Boolean, + ignoreChapters: Boolean, forceCoverHash: Long = Long.MIN_VALUE ) { val id = content.id val coverHash = if (!useCover) Long.MIN_VALUE else if (Long.MIN_VALUE == forceCoverHash) content.cover.imageHash else forceCoverHash val size = content.size - val titleCleanup = (if (useTitle) StringHelper.cleanup(content.title) else "")!! - val titleNoDigits = if (useTitle) sanitizeTitle(titleCleanup) else "" + val titleCleanup: String = if (useTitle) StringHelper.cleanup(content.title) else "" val artistsCleanup: List? 
= if (useArtist) content.attributeMap[AttributeType.ARTIST]?.map { it -> + if (useArtist) content.attributeMap[AttributeType.ARTIST]?.map { StringHelper.cleanup(it.name) } else Collections.emptyList() val countryCodes = if (useLanguage) content.attributeMap[AttributeType.LANGUAGE]?.map { LanguageHelper.getCountryCodeFromLanguage(it.name) } else Collections.emptyList() - } + val titleNoDigits: String + val minChapterBound: Int + val maxChapterBound: Int + init { + if (useTitle && ignoreChapters) { + val sanitizeResult = sanitizeTitle(titleCleanup) + titleNoDigits = sanitizeResult.first + minChapterBound = sanitizeResult.second + maxChapterBound = sanitizeResult.third + } else { + titleNoDigits = "" + minChapterBound = -1 + maxChapterBound = -1 + } + } + } } \ No newline at end of file diff --git a/app/src/main/java/me/devsaki/hentoid/util/StringHelper.java b/app/src/main/java/me/devsaki/hentoid/util/StringHelper.java index 2b3f68c406..72dbb69ce3 100644 --- a/app/src/main/java/me/devsaki/hentoid/util/StringHelper.java +++ b/app/src/main/java/me/devsaki/hentoid/util/StringHelper.java @@ -5,8 +5,10 @@ import com.annimon.stream.Stream; +import org.apache.commons.lang3.tuple.ImmutableTriple; import org.apache.commons.text.StringEscapeUtils; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; @@ -193,6 +195,44 @@ public static String keepDigits(@NonNull final String s) { return result.toString().trim(); }
+    /**
+     * Locate every run of consecutive digits inside the given string
+     *
+     * @param s String to scan
+     * @return One triple per run, in order of appearance : index of the first digit (left),
+     * index of the last digit (middle) and numeric value of the run (right).
+     * Runs whose value doesn't fit into an int are capped to Integer.MAX_VALUE
+     */
+    public static List<ImmutableTriple<Integer, Integer, Integer>> locateDigits(@NonNull final String s) {
+        List<ImmutableTriple<Integer, Integer, Integer>> result = new ArrayList<>();
+        int startIndex = -1; // -1 when not inside a run of digits
+        // Iterate one position past the end so that a run ending the string is closed by the same code path
+        for (int i = 0; i <= s.length(); i++) {
+            boolean isDigit = i < s.length() && Character.isDigit(s.charAt(i));
+            if (isDigit && startIndex < 0) {
+                startIndex = i;
+            } else if (!isDigit && startIndex > -1) {
+                result.add(new ImmutableTriple<>(startIndex, i - 1, parseDigits(s, startIndex, i)));
+                startIndex = -1;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Parse the digits located in s between start (inclusive) and end (exclusive),
+     * capping the result to Integer.MAX_VALUE instead of throwing when the run is too long
+     * (Integer.parseInt throws NumberFormatException on e.g. a 10+ digits id or timestamp)
+     */
+    private static int parseDigits(@NonNull final String s, int start, int end) {
+        long value = 0;
+        for (int i = start; i < end; i++) {
+            value = value * 10 + Character.digit(s.charAt(i), 10);
+            if (value > Integer.MAX_VALUE) return Integer.MAX_VALUE;
+        }
+        return (int) value;
+    }
+
 /** * Remove any multiple spaces from the given string to replace them with a single space * NB1 : This methods is a fast alternative to using Regexes to replace \s by ' ' diff --git a/app/src/main/java/me/devsaki/hentoid/workers/DuplicateDetectorWorker.java b/app/src/main/java/me/devsaki/hentoid/workers/DuplicateDetectorWorker.java index 57b33206bd..2cf4f05a1c 100644 --- a/app/src/main/java/me/devsaki/hentoid/workers/DuplicateDetectorWorker.java +++ b/app/src/main/java/me/devsaki/hentoid/workers/DuplicateDetectorWorker.java @@ -143,7 +143,7 @@ private void detectDuplicates( // Pre-compute all book entries as DuplicateCandidates List candidates = new ArrayList<>(); dao.streamStoredContent(false, false, Preferences.Constant.ORDER_FIELD_SIZE, true, - content -> candidates.add(new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useSameLanguage, useCover, Long.MIN_VALUE))); + content -> candidates.add(new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useSameLanguage, useCover, ignoreChapters, Long.MIN_VALUE))); trace(Log.DEBUG, "Detection started for %d books", candidates.size()); processAll( diff --git a/app/src/test/java/me/devsaki/hentoid/util/TextDupeDetectorTest.java b/app/src/test/java/me/devsaki/hentoid/util/TextDupeDetectorTest.java index e8f264be38..4ecd751278 100644 --- a/app/src/test/java/me/devsaki/hentoid/util/TextDupeDetectorTest.java +++ b/app/src/test/java/me/devsaki/hentoid/util/TextDupeDetectorTest.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.List; +import me.devsaki.hentoid.database.domains.Content; import me.devsaki.hentoid.util.string_similarity.Cosine; import timber.log.Timber; @@ -49,25 +50,42 @@ public void displayDistances() { } Assert.assertFalse(vals1.isEmpty()); + System.out.printf("%d lines loaded\n", vals1.size()); + + float 
tolerance = 0.01f; + int sensitivity = 0; // 0=permissive; 2=strict + boolean ignoreChapters = true; Cosine c = new Cosine(); for (String s1 : vals1) { - String s1c = StringHelper.cleanup(s1); - String s1cp = DuplicateHelper.Companion.sanitizeTitle(s1c); + Content c1 = new Content().setTitle(s1); + DuplicateHelper.DuplicateCandidate dc1 = new DuplicateHelper.DuplicateCandidate(c1, true, false, false, false, ignoreChapters, Long.MIN_VALUE); + //String s1c = StringHelper.cleanup(s1); + //Triple s1cp = DuplicateHelper.Companion.sanitizeTitle(s1c); for (String s2 : vals1) { - if (s1 == s2) continue; - String s2c = StringHelper.cleanup(s2); - String s2cp = DuplicateHelper.Companion.sanitizeTitle(s2c); - double score = DuplicateHelper.Companion.computeTitleScore(c, s1c, s1cp, s2c, s2cp, true, 0); + //if (s1 == s2) break; + //noinspection StringEquality + if (s1 == s2) continue; // Test _both_ combinations + + Content c2 = new Content().setTitle(s2); + DuplicateHelper.DuplicateCandidate dc2 = new DuplicateHelper.DuplicateCandidate(c2, true, false, false, false, ignoreChapters, Long.MIN_VALUE); + //String s2c = StringHelper.cleanup(s2); + //Triple s2cp = DuplicateHelper.Companion.sanitizeTitle(s2c); + double score = DuplicateHelper.Companion.computeTitleScore(c, dc1, dc2, ignoreChapters, sensitivity); if (score > 0) { - double similarity1 = c.similarity(s1c, s2c); - double similarity2 = c.similarity(s1cp, s2cp); + System.out.printf("[%.4f] %s > %s\n", score, dc1.getTitleCleanup(), dc2.getTitleCleanup()); + /* + double similarity1 = c.similarity(dc1.getTitleCleanup(), dc2.getTitleCleanup()); + double similarity2 = c.similarity(dc1.getTitleNoDigits(), dc2.getTitleNoDigits()); double distance = similarity2 - similarity1; - System.out.printf("%s %s [%.4f - %.4f = %.4f ==> %.4f] %s%n", (distance < 0.01) ? 
"MATCH " : "CHAPTER ", s1c, similarity1, similarity2, distance, score, s2c); - if (distance < 0.01) - System.out.printf("%s > %s%n", s1cp, s2cp); + System.out.printf("%s %s [%.4f - %.4f = %.4f ==> %.4f] %s%n", (distance < tolerance) ? "MATCH " : "CHAPTER ", dc1.getTitleCleanup(), similarity1, similarity2, distance, score, dc2.getTitleCleanup()); + if (distance < tolerance) + System.out.printf("%s > %s%n", dc1.getTitleNoDigits(), dc2.getTitleNoDigits()); + */ } } } + System.out.print("Done\n"); + System.out.flush(); } } \ No newline at end of file