fix the issue of ratio greater than 100% (#2)

xyb · Nov 12, 2022 · f9c1feb · f9c1feb
1 parent 4e3ddb8
commit f9c1feb
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 27 deletions.
diff --git a/chunkdup/chunkdup.py b/chunkdup/chunkdup.py
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
 import sys
+from difflib import SequenceMatcher
 
 from chunksum.parser import parse_chunksums
 
 
 class CheckSumIndex:
     def __init__(self, sums):
-        self._files = {}  # file path -> file id
+        self._files = {}  # file path -> file id and details
         self._file_ids = {}  # inverse: file id -> file path
         self._chunk2file_id = {}  # hash -> file id
+        self._chunk2size = {}  # hash -> length of chunk
         self._file_id2chunk = {}  # file id -> hash
         file_id = 0
         for s in sums:
@@ -19,10 +21,11 @@ def __init__(self, sums):
                 size=sum([size for _, size in s["chunks"]]),
             )
             self._file_ids[file_id] = s["path"]
-            cids = [c for c, _ in s["chunks"]]
-            for c in cids:
+            self._file_id2chunk[file_id] = []
+            for c, size in s["chunks"]:
                 self._chunk2file_id.setdefault(c, []).append(file_id)
-            self._file_id2chunk[file_id] = cids
+                self._chunk2size[c] = size
+                self._file_id2chunk[file_id].append(c)
             file_id += 1
 
     @property
@@ -45,6 +48,33 @@ def chunk2fileid(self):
     def fileid2chunk(self):
         return self._file_id2chunk
 
+    @property
+    def chunk2size(self):
+        return self._chunk2size
+
+
+def diff_ratio(a, b, sizes1, sizes2):
+    """
+    >>> sizes = {'a': 10, 'b': 10, 'c': 20}
+    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], sizes, sizes)
+    1.0
+    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'b', 'a'], sizes, sizes)
+    0.75
+    >>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'c', 'a'], sizes, sizes)
+    0.5
+    """
+    matches = 0
+    for tag, i1, i2, _, _ in SequenceMatcher(a=a, b=b).get_opcodes():
+        if tag != "equal":
+            continue
+        for chunk in a[i1:i2]:
+            size = sizes1.get(chunk, 0) or sizes2.get(chunk, 0)
+            matches += size
+    size1 = sum([sizes1.get(chunk) for chunk in a])
+    size2 = sum([sizes2.get(chunk) for chunk in b])
+    ratio = (2 * matches) / (size1 + size2)
+    return ratio
+
 
 def find_dup_files(index1, index2):
     chunks1 = index1.chunk2fileid
@@ -64,27 +94,23 @@ def find_dup_files(index1, index2):
         file_id_pairs.extend([(x, y) for x in ids1 for y in ids2])
     file_id_pairs = list(set(file_id_pairs))
 
-    def dup_rate(file_id1, file_id2):
-        ids1 = fileids1[file_id1]
-        ids2 = fileids2[file_id2]
-        dup = len(ids1) + len(ids2) - len(set(ids1) | set(ids2))
-        rate = dup / max(len(ids1), len(ids2))
-        return rate
-
     dups = []
     for f1, f2 in file_id_pairs:
-        rate = dup_rate(f1, f2)
+        ids1 = fileids1[f1]
+        ids2 = fileids2[f2]
+        ratio = diff_ratio(ids1, ids2, index1.chunk2size, index2.chunk2size)
         path1 = index1._file_ids[f1]
         path2 = index2._file_ids[f2]
         size1 = index1._files[path1]["size"]
         size2 = index2._files[path2]["size"]
-        dups.append((rate, size1, path1, size2, path2))
+        dups.append((ratio, size1, path1, size2, path2))
     return dups
 
 
 def find_dup(chunksum_file1, chunksum_file2):
     """
     >>> import io
+    >>> from pprint import pprint
     >>> chunksum1 = '''
     ... sum1  /A/1  fck0sha2!a:10,b:10
     ... sum2  /A/2  fck0sha2!c:10,d:10,e:10
@@ -99,12 +125,16 @@ def find_dup(chunksum_file1, chunksum_file2):
     ... '''
     >>> file1 = io.StringIO(chunksum1)
     >>> file2 = io.StringIO(chunksum2)
-    >>> from pprint import pprint
     >>> pprint(find_dup(file1, file2))
     [(1.0, 10, '/A/4', 10, '/B/4'),
      (0.6666666666666666, 30, '/A/2', 30, '/B/2'),
      (0.5, 20, '/A/3', 20, '/B/3'),
-     (0.3333333333333333, 20, '/A/3', 30, '/B/2')]
+     (0.4, 20, '/A/3', 30, '/B/2')]
+    >>> chunksum_repeat = '''sum  filename  fck0sha2!a:1,a:1,a:1,b:2'''
+    >>> file1 = io.StringIO(chunksum_repeat)
+    >>> file2 = io.StringIO(chunksum_repeat)
+    >>> pprint(find_dup(file1, file2))
+    [(1.0, 5, 'filename', 5, 'filename')]
     """
     sums1 = parse_chunksums(chunksum_file1)
     sums2 = parse_chunksums(chunksum_file2)
@@ -136,12 +166,19 @@ def print_plain_report(dups, output_file):
     100.00%  /A/4 (10B)  /B/4 (10B)
      66.67%  /A/2 (30B)  /B/2 (30B)
      50.00%  /A/3 (20B)  /B/3 (20B)
-     33.33%  /A/3 (20B)  /B/2 (30B)
+     40.00%  /A/3 (20B)  /B/2 (30B)
+
+    >>> chunksum_repeat = '''sum  filename  fck0sha2!a:1,a:1,a:1,b:2'''
+    >>> file1 = io.StringIO(chunksum_repeat)
+    >>> file2 = io.StringIO(chunksum_repeat)
+    >>> dups = find_dup(file1, file2)
+    >>> print_plain_report(dups, sys.stdout)
+    100.00%  filename (5B)  filename (5B)
     """
-    for rate, size1, file1, size2, file2 in dups:
+    for ratio, size1, file1, size2, file2 in dups:
         print(
             "{:>6.2f}%  {} ({}B)  {} ({}B)".format(
-                rate * 100,
+                ratio * 100,
                 file1,
                 size1,
                 file2,

diff --git a/poetry.lock b/poetry.lock