-
Notifications
You must be signed in to change notification settings - Fork 0
/
performClustering.py
97 lines (82 loc) · 3.54 KB
/
performClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd #to create data frames
import tldextract #to extract domains, subdomains and tlds
from urllib.parse import urlparse #to get the path
from tabulate import tabulate
import requests,json
from Cluster import Cluster
from UrlComponents import UrlComponents
####The main function
def main():
    """Cluster phishing URLs from a CSV and report TLD/WHOIS statistics.

    Pipeline:
      1. Load the tweet/URL dataset.
      2. Split each URL into tld / domain / subdomain / path columns.
      3. Keep only rows flagged as phishing (isPhishing != 0).
      4. Group URLs into campaign clusters, then flatten the clusters
         into one row per URL.
      5. Add per-cluster unique-TLD counts and WHOIS (IANA) data.
      6. Write the final table, pipe-formatted, to a text file.
    """
    df = pd.read_csv('PhishingCampaignClustering/bda_urls_hashtags_2.csv',sep = ',', quotechar='"', skipinitialspace=True)
    urls = df.get("url")
    domains = []; tlds = []; subDomains = []; paths = []
    # Decompose every URL into its structural parts.
    for url in urls:
        extractResult = tldextract.extract(url)
        domains.append(extractResult.domain)
        tlds.append(extractResult.suffix)
        subDomains.append(extractResult.subdomain)
        paths.append(urlparse(url).path.strip())
    #end for
    df.insert(1, "tld" , tlds, True)
    df.insert(2, "domain" , domains, True)
    df.insert(3, "subdomain" , subDomains, True)
    df.insert(4, "path" , paths, True)
    # Discard non-phishing rows before clustering.
    df = df[df.isPhishing != 0]
    theclusters = Cluster()
    processData(df, theclusters)
    #printClustersToAFile(theclusters)
    # Flatten {cluster key -> [UrlComponents objects]} into a frame with
    # one row per clustered URL.
    df = pd.DataFrame([(k, o.sid, o.rd, o.url, o.tld)
                       for k, l in theclusters.clusters.items() for o in l],
                      columns=['key', 'sid', 'rd', 'url', 'tld']
                      )
    analyseDiffTlds(df)
    addWhoIsToDf(df)
    with open('PhishingCampaignClustering/roughiana.txt', 'w') as f:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(tabulate(df[["key", "unq_tld_camp", "tld", "iana" ,"url"]], tablefmt="pipe", headers="keys"), file=f)
    ##todo - after whois part is done - analyse the data- get numbers and make a report
####End Function
##todo _ get started with getting the whois _ add the registrant to df
def addWhoIsToDf(df):
    """Look up WHOIS (IANA) data for each row's registered domain and
    store it in a new 'iana' column at position 6.

    Mutates *df* in place. Makes one network request per row via
    fetch_search_data, which returns 0 on any lookup failure.
    """
    # Only the 'rd' column is read, so iterate it directly instead of
    # the original iterrows()/append loop (faster, and idiomatic).
    iana = [fetch_search_data(rd) for rd in df['rd']]
    df.insert(6, "iana", iana, True)
####End Function
# getting the WHOIS data
def fetch_search_data(rd):
    """Query the WHOIS API for registered domain *rd*.

    Returns the 'iana' field of the first result attribute, or 0 when
    the request fails, returns a non-200 status, or the response body
    lacks the expected structure.
    """
    username = "usrarshdeep"
    # NOTE(review): hard-coded credentials and an empty token — these
    # should be loaded from configuration/environment, not source code.
    token = ''
    api_url = "https://vmi935593.contaboserver.net/api/v1/get_whois_data.php"
    params = {"token": token, "username": username, "rd": rd}
    try:
        r = requests.post(api_url, params=params, timeout=100)
    except requests.RequestException:
        # Network failure -> same 0 sentinel as a bad response, so one
        # flaky lookup doesn't abort the whole run.
        return 0
    if r.status_code != 200:
        return 0
    try:
        # r.json() replaces json.loads(r.text); the except is narrowed
        # from the original bare `except:` to the failures that can
        # actually occur here (bad JSON, missing key/index, wrong type).
        return r.json()['attr'][0]['iana']
    except (ValueError, KeyError, IndexError, TypeError):
        return 0
####End Function
# add the value of unique tld to original df
def analyseDiffTlds(df):
    """Add an 'unq_tld_camp' column holding, for every row, the number
    of distinct TLDs seen in that row's cluster ('key' group), then dump
    the table to a debug text file.

    Mutates *df* in place.
    """
    # transform("nunique") broadcasts each group's distinct-TLD count to
    # every row of the group — replaces the original
    # lambda x: [x.nunique()] * len(x) with the built-in equivalent.
    df['unq_tld_camp'] = df.groupby(["key"])["tld"].transform("nunique")
    with open('PhishingCampaignClustering/roughtemp10&.txt', 'w') as f:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(tabulate(df[["key", "unq_tld_camp", "tld", "url"]], tablefmt="pipe", headers="keys"), file=f)
####End Function
def printClustersToAFile(theclusters):
    """Write each cluster key followed by its member URLs (indented) to
    a text file, one entry per line.
    """
    with open('PhishingCampaignClustering/clustersFormed104.txt', 'w') as f:
        # Iterate .items() so each cluster's member list is fetched once
        # instead of re-indexing theclusters.clusters[key] per key.
        for key, members in theclusters.clusters.items():
            print(key, file=f)
            for member in members:
                print(" " + member.url, file=f)
####End Function
def processData(df, theclusters):
    """Convert every row of *df* into a UrlComponents object and feed it
    into the cluster collection, keyed by the row's index.
    """
    for row_idx, rec in df.iterrows():
        components = UrlComponents(
            rec['subdomain'], rec['domain'], rec['tld'],
            rec['path'], rec['sid'], rec['url'], rec['rd'],
        )
        theclusters.addToCluster(components, row_idx)
####End Function
####BEGIN - start point of the program
if __name__ == '__main__':
main()