Skip to content

Commit

Permalink
Duplicate detector : Detect overlapping chapters as duplicates even w…
Browse files Browse the repository at this point in the history
…hen "ignore chapters" is set [#834]
  • Loading branch information
RobbWatershed committed Feb 18, 2023
1 parent 4f2a318 commit c2caf25
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 34 deletions.
4 changes: 2 additions & 2 deletions app/src/main/java/me/devsaki/hentoid/util/ContentHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -1537,8 +1537,8 @@ public static ImmutablePair<Content, Float> findDuplicate(@NonNull final Context
// Refine by running the actual duplicate detection algorithm against the rough candidates
List<DuplicateEntry> entries = new ArrayList<>();
StringSimilarity cosine = new Cosine();
DuplicateHelper.DuplicateCandidate reference = new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useLanguage, useCover, pHash);
List<DuplicateHelper.DuplicateCandidate> candidates = Stream.of(roughCandidates).map(c -> new DuplicateHelper.DuplicateCandidate(c, useTitle, useArtist, useLanguage, useCover, Long.MIN_VALUE)).toList();
DuplicateHelper.DuplicateCandidate reference = new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useLanguage, useCover, true, pHash);
List<DuplicateHelper.DuplicateCandidate> candidates = Stream.of(roughCandidates).map(c -> new DuplicateHelper.DuplicateCandidate(c, useTitle, useArtist, useLanguage, useCover, true, Long.MIN_VALUE)).toList();
for (DuplicateHelper.DuplicateCandidate candidate : candidates) {
DuplicateEntry entry = DuplicateHelper.Companion.processContent(reference, candidate, useTitle, useCover, useArtist, useLanguage, true, 2, cosine);
if (entry != null) entries.add(entry);
Expand Down
89 changes: 69 additions & 20 deletions app/src/main/java/me/devsaki/hentoid/util/DuplicateHelper.kt
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ import me.devsaki.hentoid.util.file.FileHelper
import me.devsaki.hentoid.util.image.ImageHelper
import me.devsaki.hentoid.util.image.ImagePHash
import me.devsaki.hentoid.util.string_similarity.StringSimilarity
import org.apache.commons.lang3.tuple.ImmutableTriple
import timber.log.Timber
import java.io.IOException
import java.io.InputStream
import java.util.*
import java.util.Collections
import java.util.concurrent.atomic.AtomicBoolean

class DuplicateHelper {
Expand Down Expand Up @@ -155,7 +156,8 @@ class DuplicateHelper {
try {
// Update the book JSON if the book folder still exists
if (content.storageUri.isNotEmpty()) {
val folder = FileHelper.getDocumentFromTreeUriString(context, content.storageUri)
val folder =
FileHelper.getDocumentFromTreeUriString(context, content.storageUri)
if (folder != null) {
if (content.jsonUri.isNotEmpty()) ContentHelper.updateJson(
context,
Expand Down Expand Up @@ -200,8 +202,8 @@ class DuplicateHelper {
}
if (useTitle) titleScore = computeTitleScore(
textComparator,
reference.titleCleanup, reference.titleNoDigits,
candidate.titleCleanup, candidate.titleNoDigits,
reference,
candidate,
ignoreChapters,
sensitivity
)
Expand Down Expand Up @@ -250,42 +252,74 @@ class DuplicateHelper {

fun computeTitleScore(
textComparator: StringSimilarity,
referenceTitleCleanup: String,
referenceTitleNoDigits: String,
candidateTitleCleanup: String,
candidateTitleNoDigits: String,
reference: DuplicateCandidate,
candidate: DuplicateCandidate,
ignoreChapters: Boolean,
sensitivity: Int
): Float {
val similarity1 =
textComparator.similarity(referenceTitleCleanup, candidateTitleCleanup)
textComparator.similarity(reference.titleCleanup, candidate.titleCleanup)
if (ignoreChapters) {
// Perfect match
if (similarity1 > 0.995) return similarity1.toFloat()
// Other cases : check if both titles are chapters or sequels
return if (similarity1 > TEXT_THRESHOLDS[sensitivity]) {
val similarity2 =
textComparator.similarity(referenceTitleNoDigits, candidateTitleNoDigits)
textComparator.similarity(reference.titleNoDigits, candidate.titleNoDigits)
// Cleaned up versions are identical
// => most probably a chapter variant -> set to 0%
if (similarity2 > similarity1 && similarity2 > 0.995) return 0f
// => most probably a chapter variant
if (similarity2 > similarity1 && similarity2 > 0.995)
return processChapterVariants(reference, candidate, similarity1.toFloat())
// Very little difference between cleaned up and original version
// => not a chapter variant
if (similarity2 - similarity1 < 0.01) {
similarity1.toFloat()
} else {
0f // Most probably a chapter variant -> set to 0%
} else { // Most probably a chapter variant
return processChapterVariants(reference, candidate, similarity1.toFloat())
}
} else {
0f // Below threshold
}
} else return if (similarity1 >= TEXT_THRESHOLDS[sensitivity]) similarity1.toFloat() else 0f
}

fun sanitizeTitle(title: String): String {
/**
 * Score two titles that have been flagged as probable chapter variants of one another.
 *
 * The pair is only kept as a duplicate when both titles carry chapter numbers and
 * the start of one chapter range falls inside the other's [min, max] range.
 *
 * @param reference  Candidate used as the reference of the comparison
 * @param candidate  Candidate compared against the reference
 * @param similarity Title similarity to return when the chapter ranges overlap
 * @return [similarity] if the chapter ranges overlap; 0 otherwise
 */
private fun processChapterVariants(
    reference: DuplicateCandidate,
    candidate: DuplicateCandidate,
    similarity: Float
): Float {
    // A max bound of -1 means no chapter number was found (e.g. "gaiden" / "ex")
    // => nothing to compare
    val bothNumbered = reference.maxChapterBound != -1 && candidate.maxChapterBound != -1
    if (!bothNumbered) return 0f

    val referenceStart = reference.minChapterBound
    val candidateStart = candidate.minChapterBound

    // Overlapping chapter numbers (two variants of the same chapters)
    // => that's an actual duplicate; don't ignore it
    val referenceStartsInsideCandidate =
        referenceStart in candidateStart..candidate.maxChapterBound
    val candidateStartsInsideReference =
        candidateStart in referenceStart..reference.maxChapterBound

    return if (referenceStartsInsideCandidate || candidateStartsInsideReference) similarity else 0f
}

fun sanitizeTitle(title: String): Triple<String, Int, Int> {
// Compute min and max chapter value
// These are to be :
// - Located in the last 20% of the title
// - Separated by at most 4 characters
var minChapter: ImmutableTriple<Int, Int, Int>? = null
var maxChapter: ImmutableTriple<Int, Int, Int>? = null
val digitsMap = StringHelper.locateDigits(title).reversed()
digitsMap.forEach {
if (it.middle >= title.length * 0.8 && null == maxChapter) maxChapter = it
else maxChapter?.let { max ->
if (it.middle >= max.left - 5) minChapter = it
}
}
if (maxChapter != null && null == minChapter) minChapter = maxChapter
val minChapterValue = if (minChapter != null) minChapter!!.right else -1
val maxChapterValue = if (maxChapter != null) maxChapter!!.right else -1

// Sanitize the title
var result = StringHelper.removeDigits(title)
for (s in TITLE_CHAPTER_WORDS) result = result.replace(s, "")
return result
return Triple(result, minChapterValue, maxChapterValue)
}

private fun computeArtistScore(
Expand All @@ -311,21 +345,36 @@ class DuplicateHelper {
useArtist: Boolean,
useLanguage: Boolean,
useCover: Boolean,
ignoreChapters: Boolean,
forceCoverHash: Long = Long.MIN_VALUE
) {
val id = content.id
val coverHash =
if (!useCover) Long.MIN_VALUE else if (Long.MIN_VALUE == forceCoverHash) content.cover.imageHash else forceCoverHash
val size = content.size
val titleCleanup = (if (useTitle) StringHelper.cleanup(content.title) else "")!!
val titleNoDigits = if (useTitle) sanitizeTitle(titleCleanup) else ""
val titleCleanup: String = if (useTitle) StringHelper.cleanup(content.title) else ""
val artistsCleanup: List<String>? =
if (useArtist) content.attributeMap[AttributeType.ARTIST]?.map { it ->
if (useArtist) content.attributeMap[AttributeType.ARTIST]?.map {
StringHelper.cleanup(it.name)
} else Collections.emptyList()
val countryCodes = if (useLanguage) content.attributeMap[AttributeType.LANGUAGE]?.map {
LanguageHelper.getCountryCodeFromLanguage(it.name)
} else Collections.emptyList()
}
val titleNoDigits: String
val minChapterBound: Int
val maxChapterBound: Int

init {
if (useTitle && ignoreChapters) {
val sanitizeResult = sanitizeTitle(titleCleanup)
titleNoDigits = sanitizeResult.first
minChapterBound = sanitizeResult.second
maxChapterBound = sanitizeResult.third
} else {
titleNoDigits = ""
minChapterBound = -1
maxChapterBound = -1
}
}
}
}
25 changes: 25 additions & 0 deletions app/src/main/java/me/devsaki/hentoid/util/StringHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

import com.annimon.stream.Stream;

import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.text.StringEscapeUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -193,6 +195,29 @@ public static String keepDigits(@NonNull final String s) {
return result.toString().trim();
}

/**
 * Locate every run of consecutive digits inside the given string
 *
 * @param s String to scan
 * @return List of digit runs, in order of appearance, each described as a triple where
 * - left is the index of the first digit of the run
 * - middle is the index of the last digit of the run (inclusive)
 * - right is the numeric value of the run; runs whose value doesn't fit
 * in an int are reported as Integer.MAX_VALUE instead of throwing
 */
public static List<ImmutableTriple<Integer, Integer, Integer>> locateDigits(@NonNull final String s) {
    List<ImmutableTriple<Integer, Integer, Integer>> result = new ArrayList<>();
    int startIndex = -1; // -1 while the scan is outside a digit run
    // Iterate one position past the end so that a run touching the end of the string
    // is closed by the same code path as any other run
    for (int i = 0; i <= s.length(); i++) {
        boolean isDigit = i < s.length() && Character.isDigit(s.charAt(i));
        if (isDigit) {
            if (startIndex < 0) startIndex = i;
        } else if (startIndex >= 0) {
            int value;
            try {
                value = Integer.parseInt(s.substring(startIndex, i));
            } catch (NumberFormatException e) {
                // The run only contains digits, so the only possible failure is an overflow
                // (e.g. timestamp or long ID embedded in a title)
                value = Integer.MAX_VALUE;
            }
            result.add(new ImmutableTriple<>(startIndex, i - 1, value));
            startIndex = -1;
        }
    }
    return result;
}

/**
* Remove any multiple spaces from the given string to replace them with a single space
* NB1 : This methods is a fast alternative to using Regexes to replace \s by ' '
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ private void detectDuplicates(
// Pre-compute all book entries as DuplicateCandidates
List<DuplicateHelper.DuplicateCandidate> candidates = new ArrayList<>();
dao.streamStoredContent(false, false, Preferences.Constant.ORDER_FIELD_SIZE, true,
content -> candidates.add(new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useSameLanguage, useCover, Long.MIN_VALUE)));
content -> candidates.add(new DuplicateHelper.DuplicateCandidate(content, useTitle, useArtist, useSameLanguage, useCover, ignoreChapters, Long.MIN_VALUE)));

trace(Log.DEBUG, "Detection started for %d books", candidates.size());
processAll(
Expand Down
40 changes: 29 additions & 11 deletions app/src/test/java/me/devsaki/hentoid/util/TextDupeDetectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.util.ArrayList;
import java.util.List;

import me.devsaki.hentoid.database.domains.Content;
import me.devsaki.hentoid.util.string_similarity.Cosine;
import timber.log.Timber;

Expand Down Expand Up @@ -49,25 +50,42 @@ public void displayDistances() {
}

Assert.assertFalse(vals1.isEmpty());
System.out.printf("%d lines loaded\n", vals1.size());

float tolerance = 0.01f;
int sensitivity = 0; // 0=permissive; 2=strict
boolean ignoreChapters = true;

Cosine c = new Cosine();
for (String s1 : vals1) {
String s1c = StringHelper.cleanup(s1);
String s1cp = DuplicateHelper.Companion.sanitizeTitle(s1c);
Content c1 = new Content().setTitle(s1);
DuplicateHelper.DuplicateCandidate dc1 = new DuplicateHelper.DuplicateCandidate(c1, true, false, false, false, ignoreChapters, Long.MIN_VALUE);
//String s1c = StringHelper.cleanup(s1);
//Triple<String, Integer, Integer> s1cp = DuplicateHelper.Companion.sanitizeTitle(s1c);
for (String s2 : vals1) {
if (s1 == s2) continue;
String s2c = StringHelper.cleanup(s2);
String s2cp = DuplicateHelper.Companion.sanitizeTitle(s2c);
double score = DuplicateHelper.Companion.computeTitleScore(c, s1c, s1cp, s2c, s2cp, true, 0);
//if (s1 == s2) break;
//noinspection StringEquality
if (s1 == s2) continue; // Test _both_ combinations

Content c2 = new Content().setTitle(s2);
DuplicateHelper.DuplicateCandidate dc2 = new DuplicateHelper.DuplicateCandidate(c2, true, false, false, false, ignoreChapters, Long.MIN_VALUE);
//String s2c = StringHelper.cleanup(s2);
//Triple<String, Integer, Integer> s2cp = DuplicateHelper.Companion.sanitizeTitle(s2c);
double score = DuplicateHelper.Companion.computeTitleScore(c, dc1, dc2, ignoreChapters, sensitivity);
if (score > 0) {
double similarity1 = c.similarity(s1c, s2c);
double similarity2 = c.similarity(s1cp, s2cp);
System.out.printf("[%.4f] %s > %s\n", score, dc1.getTitleCleanup(), dc2.getTitleCleanup());
/*
double similarity1 = c.similarity(dc1.getTitleCleanup(), dc2.getTitleCleanup());
double similarity2 = c.similarity(dc1.getTitleNoDigits(), dc2.getTitleNoDigits());
double distance = similarity2 - similarity1;
System.out.printf("%s %s [%.4f - %.4f = %.4f ==> %.4f] %s%n", (distance < 0.01) ? "MATCH " : "CHAPTER ", s1c, similarity1, similarity2, distance, score, s2c);
if (distance < 0.01)
System.out.printf("%s > %s%n", s1cp, s2cp);
System.out.printf("%s %s [%.4f - %.4f = %.4f ==> %.4f] %s%n", (distance < tolerance) ? "MATCH " : "CHAPTER ", dc1.getTitleCleanup(), similarity1, similarity2, distance, score, dc2.getTitleCleanup());
if (distance < tolerance)
System.out.printf("%s > %s%n", dc1.getTitleNoDigits(), dc2.getTitleNoDigits());
*/
}
}
}
System.out.print("Done\n");
System.out.flush();
}
}

0 comments on commit c2caf25

Please sign in to comment.