-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanup.py
168 lines (135 loc) · 4.82 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Prepares passphrase cracking lists for use with the hashcat rules at
github.com/initstring/passphrase-wordlist
"""
import sys
import re
import urllib.parse
import html
import os
import time
import argparse
from datetime import timedelta
# Minimum and maximum passphrase length, in characters (both inclusive).
# Candidates whose length falls outside this range are discarded by
# choose_candidates(). Change these if you want.
MIN_LENGTH = 8
MAX_LENGTH = 40
def parse_arguments():
    """
    Handles user-passed parameters

    Parses the two positional command-line arguments (``infile`` and
    ``outfile``) and checks that the input file is readable.

    Returns:
        argparse.Namespace with the string attributes ``infile`` and
        ``outfile``.

    Raises:
        SystemExit: with status 1 when the input file cannot be read.
            argparse also exits on its own when the arguments are invalid.
    """
    desc = 'Transforms text files in passphrase lists.'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('infile', type=str, action='store',
                        help='Input file.')
    parser.add_argument('outfile', type=str, action='store',
                        help='Output file.')
    args = parser.parse_args()

    if not os.access(args.infile, os.R_OK):
        # Report on stderr and exit non-zero so shells and calling scripts
        # can tell that the run failed (a bare sys.exit() reports success).
        print("[!] Cannot access input file, exiting", file=sys.stderr)
        sys.exit(1)

    return args
def build_buffer(infile):
    """
    Loads infile and returns a flat list of raw phrase candidates

    Each line is passed through escape_encoding() and then broken apart on
    the delimiters ';', ',' and '.'. Every whitespace-trimmed fragment
    becomes one entry of the returned list (empty fragments included).
    """
    megabytes = int(os.path.getsize(infile) / 1000000)
    infile_size = str(megabytes) + " MB"
    print("Reading from {}: {}".format(infile, infile_size))

    # Common delimiters like . , or ;
    delimiters = re.compile(r';|,|\.')

    buffer = []
    with open(infile, encoding='utf-8', errors='ignore') as file_handler:
        for raw_line in file_handler:
            # Remove HTML and URL encoding before looking for delimiters
            decoded = escape_encoding(raw_line)
            fragments = delimiters.split(decoded)
            buffer.extend(fragment.strip() for fragment in fragments)
    return buffer
def handle_punctuation(line):
    """
    Deals with common punctuation

    Removes every character other than letters, digits, spaces, apostrophes
    and ampersands, normalises the spacing, and then builds spelling
    variants of the phrase.

    Args:
        line: the phrase to clean.

    Returns:
        A list of strings: any variants first (apostrophes removed,
        ' and ' swapped for ' & ', '&' swapped for ' and '), followed by
        the cleaned phrase itself.
    """
    # Allow only letters, numbers, spaces, and some punctuation
    line = re.sub(r"[^a-zA-Z0-9 '&]", '', line)

    # Shrink runs of spaces and trim both ends. Dropping punctuation can
    # leave a stray edge space ("hello world !" -> "hello world "), which
    # would otherwise yield candidates differing only in outer whitespace.
    line = re.sub(r'\s\s+', ' ', line).strip()

    clean_lines = []

    # If line has an apostrophe make a duplicate without
    if "'" in line:
        clean_lines.append(line.replace("'", ""))

    # Make duplicate phrases for both spellings of and / &
    if ' and ' in line:
        clean_lines.append(line.replace(' and ', ' & '))
    if '&' in line:
        # Pad the replacement so "a&b" becomes "a and b", then tidy spacing
        expanded = line.replace('&', ' and ')
        clean_lines.append(re.sub(r'\s+', ' ', expanded).strip())

    # Add what is left to the list and return it
    clean_lines.append(line)
    return clean_lines
# Single-pass character map used by escape_encoding(): '-' and '_' become
# spaces, and common accented letters are folded to their plain form, as the
# tool is focused on English-language passwords.
_ESCAPE_CHAR_MAP = str.maketrans({
    '-': ' ',
    '_': ' ',
    **dict.fromkeys('àáâãäå', 'a'),
    **dict.fromkeys('èéêë', 'e'),
    **dict.fromkeys('ìíîï', 'i'),
    **dict.fromkeys('òóôõö', 'o'),
    **dict.fromkeys('ùúûü', 'u'),
    'ñ': 'n',
})


def escape_encoding(line):
    """
    Deals with common encoding and accented characters

    Args:
        line: raw text, possibly URL- and/or HTML-encoded.

    Returns:
        The decoded text, lowercased, with '-' and '_' turned into spaces,
        common accented letters replaced by their unaccented form, and all
        whitespace collapsed to single spaces with none at either end.
    """
    line = urllib.parse.unquote(line)  # convert URL encoding like %27
    line = html.unescape(line)         # convert HTML encoding like &#39;
    line = line.lower()                # convert to lowercase

    # Change - and _ to spaces and remove accents in one pass
    line = line.translate(_ESCAPE_CHAR_MAP)

    # Normalise whitespace last: the spaces that replace '-' and '_' must be
    # collapsed/trimmed too, otherwise "a--b" would come back as "a  b".
    return re.sub(r'\s+', ' ', line).strip()
def choose_candidates(line):
    """
    Final check deciding which cleaned phrases to keep

    Returns True only when the phrase holds more than one word (two allowed
    characters separated by a single space) and its length lies between
    MIN_LENGTH and MAX_LENGTH inclusive; returns False otherwise.
    """
    # Single-word candidates are thrown out
    has_word_break = re.search("[a-z0-9'&] [a-z0-9'&]", line) is not None
    if not has_word_break:
        return False

    # Too short / too long lines are thrown out
    return MIN_LENGTH <= len(line) <= MAX_LENGTH
def write_file(buffer, outfile):
    """
    Writes chosen candidates to an output file

    Each candidate is stripped of surrounding whitespace and written on its
    own line; the resulting file size is then printed.

    Args:
        buffer: iterable of candidate strings.
        outfile: path of the file to create or overwrite.
    """
    # The context manager closes the file even if a write fails. UTF-8 is
    # set explicitly so the output does not depend on the platform's locale.
    with open(outfile, 'w', encoding='utf-8') as file_handler:
        file_handler.writelines(line.strip() + '\n' for line in buffer)

    outfile_size = str(int(os.path.getsize(outfile) / 1000000))
    print("Wrote to {}: {} MB".format(outfile, outfile_size))
def main():
    """
    Entry point: runs the whole clean-up pipeline and reports the run time
    """
    started = time.time()
    args = parse_arguments()

    # Collect the surviving phrases in a set so repeats are kept only once
    final = set()
    for phrase in build_buffer(args.infile):
        final.update(filter(choose_candidates, handle_punctuation(phrase)))

    # Write the deduplicated phrases out to file
    write_file(final, args.outfile)

    elapsed = time.time() - started
    print("Elapsed time: " + str(timedelta(seconds=elapsed)))
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()