-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
266 lines (224 loc) · 11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
from __future__ import with_statement
import argparse
import contextlib
import operator
import os
import re
import threading
import time
lock = threading.Lock()
threads = {}
paths = {}
sizes = {}
# mounted_filesystems = ['/', '/media/home', '/media/UserData', '/media/win7backup', '/media/4tb-ext', '/media/pdavies/SeagateV2', '/media/pdavies/Hitachi']
# mounted_filesystems = ['/media/4tb-ext', '/media/pdavies/Hitachi']
subtree_visit_num = 0
no_access = []
not_found = []
not_folder = []
units = {"B": 1, "KB": 10 ** 3, "MB": 10 ** 6, "GB": 10 ** 9, "TB": 10 ** 12, "K": 10 ** 3, "M": 10 ** 6, "G": 10 ** 9, "T": 10 ** 12}
def threadName():
return threads[threading.get_ident()].getName()
class Args:
def __init__(self):
parser = argparse.ArgumentParser('Find duplicate folders in filesystem')
# parser.add_argument('infile', metavar='input file', nargs=1, help='File to process')
parser.add_argument('-e', '--exclude', default=None, help='Comma separated list of root-level folders to exclude')
parser.add_argument('-m', '--minsize', default='5G', help='Smallest size to care about')
parser.add_argument('-x', '--exclude-subfolders', default='/media/pdavies/Hitachi/timeshift,/proc,/run', help='Folders to exclude from search')
parser_args = parser.parse_args()
# print(parser_args)
if parser_args.exclude:
self.exclude = parser_args.exclude.split(',')
else:
self.exclude = []
if parser_args.exclude_subfolders:
self.exclude_subfolders = parser_args.exclude_subfolders.split(',')
else:
self.exclude_subfolders = []
self.minsize = Args.parse_size(parser_args.minsize)
@staticmethod
def parse_size(size):
# print('Parse', size)
size = size.upper()
if not size.endswith('B'):
size += 'B'
if not re.match(r' ', size):
size = re.sub(r'([KMGT]?B)', r' \1', size)
number, unit = [string.strip() for string in size.split()]
return int(float(number) * units[unit])
args = Args()
print('Min folder size', args.minsize)
print('Exclude root folders', args.exclude)
exclude_folders = args.exclude
exclude_subfolders = args.exclude_subfolders
print("Filesystem\tMounted on\tUse%\tIUse%")
mounted_filesystems = {}
nonroot_filesystems = []
with contextlib.closing(open('/etc/mtab')) as fp:
for m in fp:
fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = m.split()
if fs_spec.startswith('/') and '/loop' not in fs_spec:
try:
r = os.statvfs(fs_file)
if r.f_files > 0:
block_usage_pct = 100.0 - (float(r.f_bavail) / float(r.f_blocks) * 100)
inode_usage_pct = 100.0 - (float(r.f_favail) / float(r.f_files) * 100)
if fs_file in exclude_folders:
print('Exclude drive folder', fs_file)
else:
if fs_spec not in mounted_filesystems:
mounted_filesystems[fs_spec] = fs_file
if fs_file != '/':
nonroot_filesystems.append(fs_file)
print("%s\t%s\t\t%d%%\t%d%%" % (fs_spec, fs_file, block_usage_pct, inode_usage_pct))
except PermissionError:
pass
def isChild(p1, p2):
return p1.startswith(p2) and p1 != p2
class Duplicate:
def __init__(self, size, path1, path2):
self.size = size
if path1 > path2:
self.__path1 = path1
self.__path2 = path2
else:
self.__path1 = path1
self.__path2 = path2
def __str__(self):
return '%d, <%s> and <%s>' % (self.size, self.__path1, self.__path2)
def getKey(self):
return self.__path1 + '+' + self.__path2
def anyendswith(self, other_dupe, string):
return self.__path1.endswith(string) or self.__path2.endswith(string) or other_dupe.__path1.endswith(string) or other_dupe.__path2.endswith(string)
def isChildOf(self, other_dupe):
# Either path1 or path2 of either dupe might be a different device, so a different root, but they must have a root in common
# so if we compare all permutations we should catch it
# Comparison is a,b with c,d so a child would mean 'a' is a child of 'c' and 'b' of 'd' OR
# 'a' of 'd' and 'b' of 'c'
# if self.anyendswith(other_dupe, 'Win7Pro-disk3.vmdk'):
# print('Yeah')
we_are_child = isChild(self.__path1, other_dupe.__path1) and isChild(self.__path2, other_dupe.__path2)
we_are_child = we_are_child or (isChild(self.__path1, other_dupe.__path2) and isChild(self.__path2, other_dupe.__path1))
return we_are_child
duplicates = []
def searchTree(path):
# What if here we treat files and folders uniformly?
global subtree_visit_num
if path in exclude_subfolders:
print('Path is in subfolders to exclude:', path)
return 0, 0
if threadName() == '/':
# print('root')
if any(substring in path for substring in nonroot_filesystems):
print("Skip %s whilst searcing '/', it's on another drive" % path)
return 0, 0
total_size = 0
digest = 0
subtree_visit_num += 1
if subtree_visit_num % 100000 == 0:
# if len(path) > 100:
# vp = path[0:49] + '...' + path[-49:]
# else:
# vp = path
print('Visiting subtree #%d. Paths added: %d. Duplicates: %d. %s' % (subtree_visit_num, len(paths), len(duplicates), threadName()))
# At this level we're assuming this is a folder, and we've ensured we're only calling this for folders
try:
# Examine every file and folder at this level
with os.scandir(path) as dir_entries:
for dir_entry in dir_entries:
# Ignore symbolic links
if 'smb-share:server' in dir_entry.path:
continue
if dir_entry.is_symlink():
continue
basename = os.path.basename(dir_entry.path)
filedigest = hash(basename)
digest += filedigest
if dir_entry.is_file():
filesize = os.path.getsize(dir_entry.path)
total_size += filesize
# Also add in the file size as an extra indicator of similarity when comparing with cousins
digest += filesize
# However, if this file itself is huge, it earns its own entry
if filesize > args.minsize:
file_key = filedigest + filesize
filepath = dir_entry.path
# print('Effing great file', filepath, threadName())
if file_key in paths:
# The key already has an entry, but the paths really really should be different!
if filepath == paths[file_key]:
print('Problem!', filepath, paths[file_key], dir_entry.is_symlink(), threadName())
exit(3)
new_dupe = Duplicate(filesize, filepath, paths[file_key])
# With folders we need to check for children so we can collapse a subtree. We have no such issue here
# because files hav no children. We do need later folder checks to recognize these file entries as children
# and remove them when dupe candidates are found which are parents of this file here. This should work since
# it's done by path, and files and folders use the same syntax
duplicates.append(new_dupe)
else:
paths[file_key] = filepath
sizes[file_key] = filesize
else:
folder_digest, folder_size = searchTree(dir_entry.path)
digest += folder_digest
total_size += folder_size
if total_size > args.minsize and digest != 0:
with lock:
folder_key = digest
if folder_key in paths:
# If either path of a duplicate wholly contains either path of another duplicate, then it's a subtree
new_dupe = Duplicate(total_size, path, paths[folder_key])
orphan = True
children = {}
for already_registered_dupe in duplicates:
if new_dupe.isChildOf(already_registered_dupe):
# We have a parent, so our existence is pointless
# assert not children, 'We have a child and a parent. How did the child survive here without the parent already deleting it?'
# print('We have a parent')
orphan = False
if already_registered_dupe.isChildOf(new_dupe):
# We're the parent of an existing duplicate, so that child is now useless and should be removed
# assert orphan, 'We have a parent and a child. How did the child survive here without the parent already deleting it?'
# print('We have a child')
if already_registered_dupe.getKey() not in children:
children[already_registered_dupe.getKey()] = already_registered_dupe
if orphan:
duplicates.append(new_dupe)
for child in children.values():
duplicates.remove(child)
else:
drive_threads = []
for th in threads.values():
if th.is_alive():
drive_threads.append(th.name)
paths[digest] = path
sizes[digest] = total_size
except PermissionError:
no_access.append(path)
except FileNotFoundError:
not_found.append(path)
except NotADirectoryError:
not_folder.append(path)
return digest, total_size
drive_timings = []
def timedSearchTree(mounted_fs):
threads[threading.get_ident()] = x
start = time.time()
searchTree(mounted_fs)
end = time.time()
drive_timings.append('Scanning drive %s took %d seconds' % (mounted_fs, end - start))
if __name__ == "__main__":
for mounted_filesystem in mounted_filesystems.values():
x = threading.Thread(target=timedSearchTree, args=(mounted_filesystem,), daemon=True, name=mounted_filesystem)
x.start()
for thread in threads.values():
thread.join()
sorted_duplicates = sorted(duplicates, key=operator.attrgetter('size'), reverse=True)
print('Candidate duplicates:')
for duplicate in sorted_duplicates:
print(duplicate)
print('----')
for drive_timing in drive_timings:
print(drive_timing)
print('...done')