Skip to content

Commit

Permalink
fix the issue of ratio greater than 100% (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
xyb authored Nov 12, 2022
1 parent 4e3ddb8 commit f9c1feb
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 27 deletions.
73 changes: 55 additions & 18 deletions chunkdup/chunkdup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
#!/usr/bin/env python
import sys
from difflib import SequenceMatcher

from chunksum.parser import parse_chunksums


class CheckSumIndex:
def __init__(self, sums):
self._files = {} # file path -> file id
self._files = {} # file path -> file id and details
self._file_ids = {} # inverse: file id -> file path
self._chunk2file_id = {} # hash -> file id
self._chunk2size = {} # hash -> length of chunk
self._file_id2chunk = {} # file id -> hash
file_id = 0
for s in sums:
Expand All @@ -19,10 +21,11 @@ def __init__(self, sums):
size=sum([size for _, size in s["chunks"]]),
)
self._file_ids[file_id] = s["path"]
cids = [c for c, _ in s["chunks"]]
for c in cids:
self._file_id2chunk[file_id] = []
for c, size in s["chunks"]:
self._chunk2file_id.setdefault(c, []).append(file_id)
self._file_id2chunk[file_id] = cids
self._chunk2size[c] = size
self._file_id2chunk[file_id].append(c)
file_id += 1

@property
Expand All @@ -45,6 +48,33 @@ def chunk2fileid(self):
def fileid2chunk(self):
return self._file_id2chunk

@property
def chunk2size(self):
return self._chunk2size


def diff_ratio(a, b, sizes1, sizes2):
"""
>>> sizes = {'a': 10, 'b': 10, 'c': 20}
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], sizes, sizes)
1.0
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'a', 'b', 'a'], sizes, sizes)
0.75
>>> diff_ratio(['a', 'a', 'a', 'a'], ['a', 'c', 'a'], sizes, sizes)
0.5
"""
matches = 0
for tag, i1, i2, _, _ in SequenceMatcher(a=a, b=b).get_opcodes():
if tag != "equal":
continue
for chunk in a[i1:i2]:
size = sizes1.get(chunk, 0) or sizes2.get(chunk, 0)
matches += size
size1 = sum([sizes1.get(chunk) for chunk in a])
size2 = sum([sizes2.get(chunk) for chunk in b])
ratio = (2 * matches) / (size1 + size2)
return ratio


def find_dup_files(index1, index2):
chunks1 = index1.chunk2fileid
Expand All @@ -64,27 +94,23 @@ def find_dup_files(index1, index2):
file_id_pairs.extend([(x, y) for x in ids1 for y in ids2])
file_id_pairs = list(set(file_id_pairs))

def dup_rate(file_id1, file_id2):
ids1 = fileids1[file_id1]
ids2 = fileids2[file_id2]
dup = len(ids1) + len(ids2) - len(set(ids1) | set(ids2))
rate = dup / max(len(ids1), len(ids2))
return rate

dups = []
for f1, f2 in file_id_pairs:
rate = dup_rate(f1, f2)
ids1 = fileids1[f1]
ids2 = fileids2[f2]
ratio = diff_ratio(ids1, ids2, index1.chunk2size, index2.chunk2size)
path1 = index1._file_ids[f1]
path2 = index2._file_ids[f2]
size1 = index1._files[path1]["size"]
size2 = index2._files[path2]["size"]
dups.append((rate, size1, path1, size2, path2))
dups.append((ratio, size1, path1, size2, path2))
return dups


def find_dup(chunksum_file1, chunksum_file2):
"""
>>> import io
>>> from pprint import pprint
>>> chunksum1 = '''
... sum1 /A/1 fck0sha2!a:10,b:10
... sum2 /A/2 fck0sha2!c:10,d:10,e:10
Expand All @@ -99,12 +125,16 @@ def find_dup(chunksum_file1, chunksum_file2):
... '''
>>> file1 = io.StringIO(chunksum1)
>>> file2 = io.StringIO(chunksum2)
>>> from pprint import pprint
>>> pprint(find_dup(file1, file2))
[(1.0, 10, '/A/4', 10, '/B/4'),
(0.6666666666666666, 30, '/A/2', 30, '/B/2'),
(0.5, 20, '/A/3', 20, '/B/3'),
(0.3333333333333333, 20, '/A/3', 30, '/B/2')]
(0.4, 20, '/A/3', 30, '/B/2')]
>>> chunksum_repeat = '''sum filename fck0sha2!a:1,a:1,a:1,b:2'''
>>> file1 = io.StringIO(chunksum_repeat)
>>> file2 = io.StringIO(chunksum_repeat)
>>> pprint(find_dup(file1, file2))
[(1.0, 5, 'filename', 5, 'filename')]
"""
sums1 = parse_chunksums(chunksum_file1)
sums2 = parse_chunksums(chunksum_file2)
Expand Down Expand Up @@ -136,12 +166,19 @@ def print_plain_report(dups, output_file):
100.00% /A/4 (10B) /B/4 (10B)
66.67% /A/2 (30B) /B/2 (30B)
50.00% /A/3 (20B) /B/3 (20B)
33.33% /A/3 (20B) /B/2 (30B)
40.00% /A/3 (20B) /B/2 (30B)
>>> chunksum_repeat = '''sum filename fck0sha2!a:1,a:1,a:1,b:2'''
>>> file1 = io.StringIO(chunksum_repeat)
>>> file2 = io.StringIO(chunksum_repeat)
>>> dups = find_dup(file1, file2)
>>> print_plain_report(dups, sys.stdout)
100.00% filename (5B) filename (5B)
"""
for rate, size1, file1, size2, file2 in dups:
for ratio, size1, file1, size2, file2 in dups:
print(
"{:>6.2f}% {} ({}B) {} ({}B)".format(
rate * 100,
ratio * 100,
file1,
size1,
file2,
Expand Down
18 changes: 9 additions & 9 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f9c1feb

Please sign in to comment.