-
Notifications
You must be signed in to change notification settings - Fork 0
/
training_clean.R
558 lines (456 loc) · 32.9 KB
/
training_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
#Divided into two sections due to time-required
###################################################################
#Part 1:
###################################################################
remoteLogin("http://",diff = TRUE,session = TRUE,commandline = TRUE)
library(data.table)
library(dplyr)
library(mrsdeploy)
train<-fread("C:/Data/microsoft-malware-prediction/train.csv",
drop=c('Census_IsWIMBootEnabled','Census_IsFlightingInternal',
'Census_ThresholdOptIn','DefaultBrowsersIdentifier','SMode'))
#Imputing Mean
train[which(is.na(train$AVProductsInstalled)),'AVProductsInstalled']<-round(mean(na.exclude(train$AVProductsInstalled)),0)
train[which(is.na(train$AVProductsEnabled)),'AVProductsEnabled']<-round(mean(na.exclude(train$AVProductsEnabled)),0)
train[which(is.na(train$RtpStateBitfield)),'RtpStateBitfield']<-round(mean(na.exclude(train$RtpStateBitfield)),0)
train[which(is.na(train$Wdft_IsGamer)),'Wdft_IsGamer']<-round(mean(na.exclude(train$Wdft_IsGamer)),0)
train[which(is.na(train$Census_IsAlwaysOnAlwaysConnectedCapable)),'Census_IsAlwaysOnAlwaysConnectedCapable']<-round(mean(na.exclude(train$Census_IsAlwaysOnAlwaysConnectedCapable)),0)
train[which(is.na(train$Census_IsVirtualDevice)),'Census_IsVirtualDevice']<-round(mean(na.exclude(train$Census_IsVirtualDevice)),0)
train[which(is.na(train$Firewall)),'Firewall']<-round(mean(na.exclude(train$Firewall)),0)
train[which(is.na(train$IsProtected)),'IsProtected']<-round(mean(na.exclude(train$IsProtected)),0)
train[which(is.na(train$Census_IsFlightsDisabled)),'Census_IsFlightsDisabled']<-round(mean(na.exclude(train$Census_IsFlightsDisabled)),0)
#Imputing with Mode
train[which(is.na(train$Wdft_RegionIdentifier)),'Wdft_RegionIdentifier']<-10
train[which(is.na(train$GeoNameIdentifier)),'GeoNameIdentifier']<-277
train[which(is.na(train$UacLuaenable)),'UacLuaenable']<-1
train[which(is.na(train$AVProductStatesIdentifier)),'AVProductStatesIdentifier']<-53447
train[which(is.na(train$Census_TotalPhysicalRAM)),'Census_TotalPhysicalRAM']<-4096
train[which(is.na(train$OrganizationIdentifier)),'OrganizationIdentifier']<-27
#Census_ProcessorCoreCount apparently 4 is the standard amount of core processors...good to know
table(train[which(is.na(train$Census_ProcessorCoreCount)),'Census_MDC2FormFactor'])
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="Notebook"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="PCOther"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="LargeTablet"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="Desktop"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="SmallTablet"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="AllInOne"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="Detachable"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="Convertible"),'Census_ProcessorCoreCount']<-4
train[which(is.na(train$Census_ProcessorCoreCount)&train$Census_MDC2FormFactor=="ServerOther"),'Census_ProcessorCoreCount']<-4
df<-data.frame(levels(as.factor(train$Census_MDC2FormFactor)),seq(1:13),seq(1:13))
colnames(df)<-c("Census_MDC2FormFactor","Horizontal","Vertical")
for (i in 1:13){
df$Horizontal[i]<-round(mean(na.exclude(train$Census_InternalPrimaryDisplayResolutionHorizontal[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
df$Vertical[i]<-round(mean(na.exclude(train$Census_InternalPrimaryDisplayResolutionVertical[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
}
df
replace<-train[which(is.na(train$Census_InternalPrimaryDisplayResolutionVertical)),c('Census_MDC2FormFactor','Census_InternalPrimaryDisplayResolutionHorizontal','Census_InternalPrimaryDisplayResolutionVertical')]
#Originally written to be executed in one loop but that's too much for the R server to handle
for (i in 1:nrow(replace)){
replace$Census_InternalPrimaryDisplayResolutionHorizontal[i]<-df[df$Census_MDC2FormFactor==replace$Census_MDC2FormFactor[i],'Horizontal']
}
for (i in 1:nrow(replace)){
replace$Census_InternalPrimaryDisplayResolutionVertical[i]<-df[df$Census_MDC2FormFactor==replace$Census_MDC2FormFactor[i],'Vertical']
}
replace
replace<-replace[,-1]
train[which(is.na(train$Census_InternalPrimaryDisplayResolutionVertical)),c('Census_InternalPrimaryDisplayResolutionHorizontal','Census_InternalPrimaryDisplayResolutionVertical')]<-replace
#Similar plan of action for total capacity
df<-data.frame(levels(as.factor(train$Census_MDC2FormFactor)),seq(1:13),seq(1:13))
colnames(df)<-c("Census_MDC2FormFactor","Census_PrimaryDiskTotalCapacity","Census_SystemVolumeTotalCapacity")
for (i in 1:13){
df$Census_PrimaryDiskTotalCapacity[i]<-round(mean(na.exclude(train$Census_PrimaryDiskTotalCapacity[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
df$Census_SystemVolumeTotalCapacity[i]<-round(mean(na.exclude(train$Census_SystemVolumeTotalCapacity[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
}
df
df$Census_MDC2FormFactor<-as.character(df$Census_MDC2FormFactor)
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="AllInOne"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[1,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="Convertible"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[2,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="Desktop"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[3,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="Detachable"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[4,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="IoTOther"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[5,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="LargeServer"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[6,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="LargeTablet"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[7,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="MediumServer"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[8,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="Notebook"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[9,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="PCOther"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[10,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="ServerOther"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[11,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="SmallServer"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[12,2:3]
train[which(is.na(train$Census_SystemVolumeTotalCapacity)&train$Census_MDC2FormFactor=="SmallTablet"),c('Census_PrimaryDiskTotalCapacity','Census_SystemVolumeTotalCapacity')]<-df[13,2:3]
train[which(is.na(train$Census_PrimaryDiskTotalCapacity)),'Census_MDC2FormFactor']
train[which(is.na(train$Census_PrimaryDiskTotalCapacity)&train$Census_MDC2FormFactor=="Desktop"),'Census_PrimaryDiskTotalCapacity']<-df[3,2]
train[which(is.na(train$Census_PrimaryDiskTotalCapacity)&train$Census_MDC2FormFactor=="Notebook"),'Census_PrimaryDiskTotalCapacity']<-df[9,2]
train[which(is.na(train$Census_PrimaryDiskTotalCapacity)&train$Census_MDC2FormFactor=="Convertible"),'Census_PrimaryDiskTotalCapacity']<-df[2,2]
#The same exact process for 2 other real value features
df<-data.frame(levels(as.factor(train$Census_MDC2FormFactor)),seq(1:13),seq(1:13))
colnames(df)<-c("Census_MDC2FormFactor","Census_InternalBatteryNumberOfCharges","Census_InternalPrimaryDiagonalDisplaySizeInInches")
for (i in 1:13){
df$Census_InternalBatteryNumberOfCharges[i]<-round(mean(na.exclude(train$Census_InternalBatteryNumberOfCharges[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
df$Census_InternalPrimaryDiagonalDisplaySizeInInches[i]<-round(mean(na.exclude(train$Census_InternalPrimaryDiagonalDisplaySizeInInches[train$Census_MDC2FormFactor== df$Census_MDC2FormFactor[i]])))
}
df
df$Census_MDC2FormFactor<-as.character(df$Census_MDC2FormFactor)
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="AllInOne"),'Census_InternalBatteryNumberOfCharges']<-df[1,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="Convertible"),'Census_InternalBatteryNumberOfCharges']<-df[2,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="Desktop"),'Census_InternalBatteryNumberOfCharges']<-df[3,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="Detachable"),'Census_InternalBatteryNumberOfCharges']<-df[4,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="IoTOther"),'Census_InternalBatteryNumberOfCharges']<-df[5,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="LargeServer"),'Census_InternalBatteryNumberOfCharges']<-df[6,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="LargeTablet"),'Census_InternalBatteryNumberOfCharges']<-df[7,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="MediumServer"),'Census_InternalBatteryNumberOfCharges']<-df[8,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="Notebook"),'Census_InternalBatteryNumberOfCharges']<-df[9,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="PCOther"),'Census_InternalBatteryNumberOfCharges']<-df[10,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="ServerOther"),'Census_InternalBatteryNumberOfCharges']<-df[11,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="SmallServer"),'Census_InternalBatteryNumberOfCharges']<-df[12,2]
train[which(is.na(train$Census_InternalBatteryNumberOfCharges)&train$Census_MDC2FormFactor=="SmallTablet"),'Census_InternalBatteryNumberOfCharges']<-df[13,2]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="AllInOne"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[1,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="Convertible"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[2,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="Desktop"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[3,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="Detachable"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[4,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="IoTOther"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[5,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="LargeServer"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[6,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="LargeTablet"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[7,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="MediumServer"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[8,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="Notebook"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[9,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="PCOther"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[10,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="ServerOther"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[11,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="SmallServer"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[12,3]
train[which(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches)&train$Census_MDC2FormFactor=="SmallTablet"),'Census_InternalPrimaryDiagonalDisplaySizeInInches']<-df[13,3]
df<-data.frame(levels(as.factor(train$CountryIdentifier)),seq(1:222))
colnames(df)<-c("CountryIdentifier","MostPopularCity")
df$CountryIdentifier<-as.numeric(df$CountryIdentifier)
getmode <- function(x) {
uniqx <- unique(x)
uniqx[which.max(tabulate(match(x, uniqx)))]
}
for (i in 1:222){
df$MostPopularCity[i]<-getmode(na.exclude(train$CityIdentifier[train$CountryIdentifier== df$CountryIdentifier[i]]))
}
df<-distinct(df,CountryIdentifier,.keep_all=T)
df<-df[order(df$CountryIdentifier),]
#Have to put in different loops because combining them crashes the server
for (i in 1:50){
train[which(is.na(train$CityIdentifier)&train$CountryIdentifier==df$CountryIdentifier[i]),'CityIdentifier']<-df$MostPopularCity[i]
print(i)
}
for (i in 51:100){
train[which(is.na(train$CityIdentifier)&train$CountryIdentifier==df$CountryIdentifier[i]),'CityIdentifier']<-df$MostPopularCity[i]
print(i)
}
for (i in 101:150){
train[which(is.na(train$CityIdentifier)&train$CountryIdentifier==df$CountryIdentifier[i]),'CityIdentifier']<-df$MostPopularCity[i]
print(i)
}
for (i in 151:200){
train[which(is.na(train$CityIdentifier)&train$CountryIdentifier==df$CountryIdentifier[i]),'CityIdentifier']<-df$MostPopularCity[i]
print(i)
}
for (i in 201:222){
train[which(is.na(train$CityIdentifier)&train$CountryIdentifier==df$CountryIdentifier[i]),'CityIdentifier']<-df$MostPopularCity[i]
print(i)
}
#Recode with mode
getmode(na.exclude(train$Census_ProcessorModelIdentifier))
sum(na.exclude(train$Census_ProcessorModelIdentifier==2697))
train[which(is.na(train$Census_ProcessorModelIdentifier)),'Census_ProcessorModelIdentifier']<-2697
getmode(na.exclude(train$Census_ProcessorManufacturerIdentifier))
sum(na.exclude(train$Census_ProcessorManufacturerIdentifier==5))
train[which(is.na(train$Census_ProcessorManufacturerIdentifier)),'Census_ProcessorManufacturerIdentifier']<-5
getmode(na.exclude(train$IeVerIdentifier))
sum(na.exclude(train$IeVerIdentifier==137))
train[which(is.na(train$IeVerIdentifier)),'IeVerIdentifier']<-137
getmode(na.exclude(train$Census_OSInstallLanguageIdentifier))
sum(na.exclude(train$Census_OSInstallLanguageIdentifier==8))
train[which(is.na(train$Census_OSInstallLanguageIdentifier)),'Census_OSInstallLanguageIdentifier']<-8
getmode(na.exclude(train$Census_OEMNameIdentifier))
sum(na.exclude(train$Census_OEMNameIdentifier==2668))
train[which(is.na(train$Census_OEMNameIdentifier)),'Census_OEMNameIdentifier']<-2668
getmode(na.exclude(train$Census_OEMModelIdentifier))
sum(na.exclude(train$Census_OEMModelIdentifier==313586))
train[which(is.na(train$Census_OEMModelIdentifier)),'Census_OEMModelIdentifier']<-313586
getmode(na.exclude(train$Census_FirmwareVersionIdentifier))
sum(na.exclude(train$Census_FirmwareVersionIdentifier==33105))
train[which(is.na(train$Census_FirmwareVersionIdentifier)),'Census_FirmwareVersionIdentifier']<-33105
getmode(na.exclude(train$Census_FirmwareManufacturerIdentifier))
sum(na.exclude(train$Census_FirmwareManufacturerIdentifier==142))
train[which(is.na(train$Census_FirmwareManufacturerIdentifier)),'Census_FirmwareManufacturerIdentifier']<-142
write.csv(train,"C:/Data/microsoft-malware-prediction/cleaned_train.csv",row.names = F)
###################################################################
#Part 2:
###################################################################
remoteLogin("http://",diff = TRUE,session = TRUE,commandline = TRUE)
library(data.table)
library(dplyr)
library(mrsdeploy)
train<-fread("C:/Data/microsoft-malware-prediction/cleaned_train.csv",
drop=c('OsBuildLab','Census_PowerPlatformRoleName','OsPlatformSubRelease',
'Census_InternalBatteryType','SkuEdition','Census_OSArchitecture',
'Census_ChassisTypeName','Census_OSSkuName','Census_DeviceFamily',
'Census_ProcessorClass'))
#Get rid of OsBuildLab as everything is in the other columns
#Get rid of internal battery type as the variables make no sense
#Get rid of Census_PowerPlatformRoleName as all information is preserved in my fav field
#Get rid of OsPlatformSubRelease as we're just using main versioning info
#Get rid of SkuEdition as osskuname has all the same info
#get rid of Census_OSArchitecture as all the same info is in Processor
#get rid of Census_ChassisTypeName as all the same info is in Census_MDC2FormFactor
#removing Census_OSSkuName because it is also redundant to other fields
#removing Census_DeviceFamily because all of them are Windows and granularity preserved
#in other fields
#table(train$Census_ProcessorClass)
#Almost all NAs, removing because it's pointless
#Now that NAs are removed, character vectors need to be dealt with
#levels(as.factor(train$PuaMode))
train[train$PuaMode=="",'PuaMode']<-"0"
train[train$PuaMode=="audit",'PuaMode']<-"0.5"
train[train$PuaMode=="on",'PuaMode']<-"1"
#levels(as.factor(train$Census_OSVersion))
train$Census_OSVersion<-gsub("[[:punct:]]0[[:punct:]]","",train$Census_OSVersion)
train$Census_OSVersion<-gsub("[[:punct:]]","",train$Census_OSVersion)
train$Census_OSVersion<-substring(train$Census_OSVersion,1,5)
train[train$Census_OSVersion=="61760",'Census_OSVersion']<-"06176"
train[train$Census_OSVersion=="62920",'Census_OSVersion']<-"06292"
train[train$Census_OSVersion=="63960",'Census_OSVersion']<-"06396"
#Make a new feature for if SmartScreen= ExistsNotSet 1 y 0 n
#has an 80% detection rate
train$ExistsNotSet<-NA
train[train$SmartScreen=="ExistsNotSet",'ExistsNotSet']<-1
train[train$SmartScreen!="ExistsNotSet",'ExistsNotSet']<-0
#levels(as.factor(train$Platform))
train[which(train$Platform=="windows10"),'Platform']<-10
train[which(train$Platform=="windows7"),'Platform']<-7
train[which(train$Platform=="windows8"),'Platform']<-8
train[which(train$Platform=="windows2016"),'Platform']<-16
#levels(as.factor(train$Processor))
train[which(train$Processor=="x64"|train$Processor=="arm64"),'Processor']<-64
train[which(train$Processor=="x86"),'Processor']<-86
#levels(as.factor(train$AvSigVersion))
train$AvSigVersion<-gsub("1[[:punct:]]","",train$AvSigVersion)
train$AvSigVersion<-gsub("[[:punct:]][[:digit:]]","",train$AvSigVersion)
train[which(train$AvSigVersion=="0"),'AvSigVersion']<-"200"
train$AvSigVersion<-substring(train$AvSigVersion,1,3)
#levels(as.factor(train$OsVer))
train$OsVer<-gsub("[[:punct:]]","",train$OsVer)
train$OsVer<-substring(train$OsVer,1,2)
train$OsVer<-as.numeric(train$OsVer)
train[train$OsVer>50,'OsVer']<-6
#levels(as.factor(train$EngineVersion))
train$EngineVersion<-gsub("1[[:punct:]]","",train$EngineVersion)
train$EngineVersion<-gsub("[[:punct:]]","",train$EngineVersion)
train$EngineVersion<-substring(train$EngineVersion,1,3)
#levels(as.factor(train$AppVersion))
train$AppVersion<-gsub("[[:punct:]]","",train$AppVersion)
train$AppVersion<-substring(train$AppVersion,1,3)
#levels(as.factor(train$SmartScreen))
train[train$SmartScreen=="",'SmartScreen']<-"RequireAdmin"
train[train$SmartScreen %in% c("off","OFF"),'SmartScreen']<-"Off"
train[train$SmartScreen %in% c("Enabled","on"),'SmartScreen']<-"On"
train[train$SmartScreen=="warn",'SmartScreen']<-"Warn"
train[train$SmartScreen %in% c("prompt","Promt"),'SmartScreen']<-"Prompt"
train[train$SmartScreen %in% c("requireadmin",'requireAdmin'),'SmartScreen']<-"RequireAdmin"
#Setting all as ExistsNotSet as these are all replacement smart screens
train[train$SmartScreen %in% c("00000000","","","","0"),'SmartScreen']<-"ExistsNotSet"
#df<-data.frame(levels(as.factor(train$SmartScreen)),seq(1:7))
#colnames(df)<-c("SmartScreen","DetectionRate")
#df$SmartScreen<-as.character(df$SmartScreen)
#for (i in 1:7){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$SmartScreen==df$SmartScreen[i]))/sum(train$SmartScreen==df$SmartScreen[i])*100),3)
#}
#df
#Highest detection Rate = 1, going down to n
train[train$SmartScreen=="ExistsNotSet",'SmartScreen']<-"1"
train[train$SmartScreen=="Warn",'SmartScreen']<-"2"
train[train$SmartScreen=="On",'SmartScreen']<-"3"
train[train$SmartScreen=="Block",'SmartScreen']<-"4"
train[train$SmartScreen=="Off",'SmartScreen']<-"5"
train[train$SmartScreen=="Prompt",'SmartScreen']<-"6"
train[train$SmartScreen=="RequireAdmin",'SmartScreen']<-"7"
#table(train$Census_MDC2FormFactor)
'%ni%' <- Negate('%in%')
#df<-data.frame(levels(as.factor(train$Census_MDC2FormFactor)),seq(1:13))
#colnames(df)<-c("Census_MDC2FormFactor","DetectionRate")
#df$Census_MDC2FormFactor<-as.character(df$Census_MDC2FormFactor)
#for (i in 1:13){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_MDC2FormFactor==df$Census_MDC2FormFactor[i]))/sum(train$Census_MDC2FormFactor==df$Census_MDC2FormFactor[i])*100),3)
#}
#df
train$IOT<-NA
#Notebook/IOTOther, LargeTablet, Notebook, SmallTablet
train[train$Census_MDC2FormFactor %in% c("Notebook","IOTOther"),'IOT']<-3
train[train$Census_MDC2FormFactor=="LargeTablet",'IOT']<-2
train[train$Census_MDC2FormFactor=="SmallTablet",'IOT']<-1
train[train$Census_MDC2FormFactor %ni% c("Notebook","LargeTablet","SmallTablet","IOTOther"),'IOT']<-0
train$PC<-NA
#Desktop/PCOther,AllInOne,Convertible,Detachable
train[train$Census_MDC2FormFactor %in% c("Desktop","PCOther"),'PC']<-4
train[train$Census_MDC2FormFactor=="AllInOne",'PC']<-3
train[train$Census_MDC2FormFactor=="Convertible",'PC']<-2
train[train$Census_MDC2FormFactor=="Detachable",'PC']<-1
train[train$Census_MDC2FormFactor %ni% c("Desktop","AllInOne","Convertible","Detachable","PCOther"),'PC']<-0
train$Server<-NA
#LargeServer,MediumServer,SmallServer,ServerOther
train[train$Census_MDC2FormFactor %in% c("SmallServer","ServerOther"),'Server']<-3
train[train$Census_MDC2FormFactor=="MediumServer",'Server']<-2
train[train$Census_MDC2FormFactor=="LargeServer",'Server']<-1
train[train$Census_MDC2FormFactor %ni% c("LargeServer","MediumServer","SmallServer","ServerOther"),'Server']<-0
train<-train[,-30]
#levels(as.factor(train$Census_OSBranch))
train[which(startsWith(train$Census_OSBranch,"rs")),'Census_OSBranch']<-"rs"
train[which(startsWith(train$Census_OSBranch,"th")),'Census_OSBranch']<-"th"
train[which(startsWith(train$Census_OSBranch,"win")),'Census_OSBranch']<-"win"
train[train$Census_OSBranch=="Khmer OS",'Census_OSBranch']<-"rs"
#df<-data.frame(levels(as.factor(train$Census_OSBranch)),seq(1:3))
#colnames(df)<-c("Census_OSBranch","DetectionRate")
#df$Census_OSBranch<-as.character(df$Census_OSBranch)
#for (i in 1:3){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_OSBranch==df$Census_OSBranch[i]))/sum(train$Census_OSBranch==df$Census_OSBranch[i])*100),3)
#}
#df
train[train$Census_OSBranch=="rs",'Census_OSBranch']<-1
train[train$Census_OSBranch=="th",'Census_OSBranch']<-2
train[train$Census_OSBranch=="win",'Census_OSBranch']<-3
#levels(as.factor(train$Census_OSEdition))
train[which(startsWith(train$Census_OSEdition,"Clo")),'Census_OSEdition']<-"Cloud"
train[which(startsWith(train$Census_OSEdition,"Core")),'Census_OSEdition']<-"Core"
train[which(startsWith(train$Census_OSEdition,"Education")),'Census_OSEdition']<-"Education"
train[which(startsWith(train$Census_OSEdition,"Home")),'Census_OSEdition']<-"Home"
train[which(startsWith(train$Census_OSEdition,"Pro")),'Census_OSEdition']<-"Professional"
train[which(startsWith(train$Census_OSEdition,"pro")),'Census_OSEdition']<-"Professional"
train[which(startsWith(train$Census_OSEdition,"Ent")),'Census_OSEdition']<-"Enterprise"
train[which(startsWith(train$Census_OSEdition,"Ser")),'Census_OSEdition']<-"Server"
train[train$Census_OSEdition=="Ultimate",'Census_OSEdition']<-"Enterprise"
#df<-data.frame(levels(as.factor(train$Census_OSEdition)),seq(1:7))
#colnames(df)<-c("Census_OSEdition","DetectionRate")
#df$Census_OSEdition<-as.character(df$Census_OSEdition)
#for (i in 1:7){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_OSEdition==df$Census_OSEdition[i]))/sum(train$Census_OSEdition==df$Census_OSEdition[i])*100),3)
#}
#df
train[train$Census_OSEdition=="Education",'Census_OSEdition']<-1
train[train$Census_OSEdition=="Enterprise",'Census_OSEdition']<-2
train[train$Census_OSEdition=="Professional",'Census_OSEdition']<-3
train[train$Census_OSEdition=="Home",'Census_OSEdition']<-4
train[train$Census_OSEdition=="Core",'Census_OSEdition']<-5
train[train$Census_OSEdition=="Cloud",'Census_OSEdition']<-6
train[train$Census_OSEdition=="Server",'Census_OSEdition']<-7
#levels(as.factor(train$Census_ActivationChannel))
train[which(startsWith(train$Census_ActivationChannel,"OEM")),'Census_ActivationChannel']<-"OEM"
train[which(startsWith(train$Census_ActivationChannel,"Vol")),'Census_ActivationChannel']<-"Volume"
train[which(startsWith(train$Census_ActivationChannel,"Ret")),'Census_ActivationChannel']<-"Retail"
#df<-data.frame(levels(as.factor(train$Census_ActivationChannel)),seq(1:3))
#colnames(df)<-c("Census_ActivationChannel","DetectionRate")
#df$Census_ActivationChannel<-as.character(df$Census_ActivationChannel)
#for (i in 1:3){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_ActivationChannel==df$Census_ActivationChannel[i]))/sum(train$Census_ActivationChannel==df$Census_ActivationChannel[i])*100),3)
#}
#df
train[train$Census_ActivationChannel=="Volume",'Census_ActivationChannel']<-1
train[train$Census_ActivationChannel=="OEM",'Census_ActivationChannel']<-2
train[train$Census_ActivationChannel=="Retail",'Census_ActivationChannel']<-3
#levels(as.factor(train$Census_FlightRing))
train[which(train$Census_FlightRing %in% c("Canary","Invalid","OSG")),'Census_FlightRing']<-"Unknown"
train[which(train$Census_FlightRing %in% c("Disabled","NOT_SET")),'Census_FlightRing']<-"No"
train[which(train$Census_FlightRing %in% c("Retail","RP")),'Census_FlightRing']<-"Retail"
train[which(train$Census_FlightRing %in% c("WIF","WIS")),'Census_FlightRing']<-"W"
#df<-data.frame(levels(as.factor(train$Census_FlightRing)),seq(1:4))
#colnames(df)<-c("Census_FlightRing","DetectionRate")
#df$Census_FlightRing<-as.character(df$Census_FlightRing)
#for (i in 1:4){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_FlightRing==df$Census_FlightRing[i]))/sum(train$Census_FlightRing==df$Census_FlightRing[i])*100),3)
#}
#df
train[train$Census_FlightRing=="Retail",'Census_FlightRing']<-1
train[train$Census_FlightRing=="No",'Census_FlightRing']<-2
train[train$Census_FlightRing=="Unknown",'Census_FlightRing']<-3
train[train$Census_FlightRing=="W",'Census_FlightRing']<-4
#levels(as.factor(train$Census_GenuineStateName))
#table(train$Census_GenuineStateName)
train[train$Census_GenuineStateName=="TAMPERED",'Census_GenuineStateName']<-"INVALID_LICENSE"
#df<-data.frame(levels(as.factor(train$Census_GenuineStateName)),seq(1:4))
#colnames(df)<-c("Census_GenuineStateName","DetectionRate")
#df$Census_GenuineStateName<-as.character(df$Census_GenuineStateName)
#for (i in 1:4){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_GenuineStateName==df$Census_GenuineStateName[i]))/sum(train$Census_GenuineStateName==df$Census_GenuineStateName[i])*100),3)
#}
#df
train[train$Census_GenuineStateName=="OFFLINE",'Census_GenuineStateName']<-1
train[train$Census_GenuineStateName=="IS_GENUINE",'Census_GenuineStateName']<-2
train[train$Census_GenuineStateName=="INVALID_LICENSE",'Census_GenuineStateName']<-3
train[train$Census_GenuineStateName=="UNKNOWN",'Census_GenuineStateName']<-4
#levels(as.factor(train$Census_OSWUAutoUpdateOptionsName))
#df<-data.frame(levels(as.factor(train$Census_OSWUAutoUpdateOptionsName)),seq(1:6))
#colnames(df)<-c("Census_OSWUAutoUpdateOptionsName","DetectionRate")
#df$Census_OSWUAutoUpdateOptionsName<-as.character(df$Census_OSWUAutoUpdateOptionsName)
#for (i in 1:6){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_OSWUAutoUpdateOptionsName==df$Census_OSWUAutoUpdateOptionsName[i]))/sum(train$Census_OSWUAutoUpdateOptionsName==df$Census_OSWUAutoUpdateOptionsName[i])*100),3)
#}
#df
train[train$Census_OSWUAutoUpdateOptionsName=="FullAuto",'Census_OSWUAutoUpdateOptionsName']<-1
train[train$Census_OSWUAutoUpdateOptionsName=="UNKNOWN",'Census_OSWUAutoUpdateOptionsName']<-2
train[train$Census_OSWUAutoUpdateOptionsName=="AutoInstallAndRebootAtMaintenanceTime",'Census_OSWUAutoUpdateOptionsName']<-3
train[train$Census_OSWUAutoUpdateOptionsName=="Off",'Census_OSWUAutoUpdateOptionsName']<-4
train[train$Census_OSWUAutoUpdateOptionsName=="Notify",'Census_OSWUAutoUpdateOptionsName']<-5
train[train$Census_OSWUAutoUpdateOptionsName=="DownloadNotify",'Census_OSWUAutoUpdateOptionsName']<-6
#levels(as.factor(train$Census_PrimaryDiskTypeName))
train[train$Census_PrimaryDiskTypeName=="",'Census_PrimaryDiskTypeName']<-"HDD"
#df<-data.frame(levels(as.factor(train$Census_PrimaryDiskTypeName)),seq(1:4))
#colnames(df)<-c("Census_PrimaryDiskTypeName","DetectionRate")
#df$Census_PrimaryDiskTypeName<-as.character(df$Census_PrimaryDiskTypeName)
#for (i in 1:4){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_PrimaryDiskTypeName==df$Census_PrimaryDiskTypeName[i]))/sum(train$Census_PrimaryDiskTypeName==df$Census_PrimaryDiskTypeName[i])*100),3)
#}
#df
train[train$Census_PrimaryDiskTypeName=="HDD",'Census_PrimaryDiskTypeName']<-1
train[train$Census_PrimaryDiskTypeName=="SSD",'Census_PrimaryDiskTypeName']<-2
train[train$Census_PrimaryDiskTypeName=="UNKNOWN",'Census_PrimaryDiskTypeName']<-3
train[train$Census_PrimaryDiskTypeName=="Unspecified",'Census_PrimaryDiskTypeName']<-4
#ProductName
#levels(as.factor(train$ProductName))
train[which(startsWith(train$ProductName,"ms")),'ProductName']<-"mse"
train[which(startsWith(train$ProductName,"win")),'ProductName']<-"windows"
#df<-data.frame(levels(as.factor(train$ProductName)),seq(1:4))
#colnames(df)<-c("ProductName","DetectionRate")
#df$ProductName<-as.character(df$ProductName)
#for (i in 1:4){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$ProductName==df$ProductName[i]))/sum(train$ProductName==df$ProductName[i])*100),3)
#}
#df
train[train$ProductName=="windows",'ProductName']<-1
train[train$ProductName=="mse",'ProductName']<-2
train[train$ProductName=="scep",'ProductName']<-3
train[train$ProductName=="fep",'ProductName']<-4
#Census_OSInstallTypeName
#levels(as.factor(train$Census_OSInstallTypeName))
train[which(startsWith(train$Census_OSInstallTypeName,"Cle")),'Census_OSInstallTypeName']<-"Clean"
train[which(train$Census_OSInstallTypeName=="IBSClean"),'Census_OSInstallTypeName']<-"Clean"
train[which(endsWith(train$Census_OSInstallTypeName,"rade")),'Census_OSInstallTypeName']<-"Upgrade"
#df<-data.frame(levels(as.factor(train$Census_OSInstallTypeName)),seq(1:6))
#colnames(df)<-c("Census_OSInstallTypeName","DetectionRate")
#df$Census_OSInstallTypeName<-as.character(df$Census_OSInstallTypeName)
#for (i in 1:6){
#df$DetectionRate[i]<-round(((sum(train$HasDetections==1&train$Census_OSInstallTypeName==df$Census_OSInstallTypeName[i]))/sum(train$Census_OSInstallTypeName==df$Census_OSInstallTypeName[i])*100),3)
#}
#df
train[train$Census_OSInstallTypeName=="Clean",'Census_OSInstallTypeName']<-1
train[train$Census_OSInstallTypeName=="Upgrade",'Census_OSInstallTypeName']<-2
train[train$Census_OSInstallTypeName=="Other",'Census_OSInstallTypeName']<-3
train[train$Census_OSInstallTypeName=="Reset",'Census_OSInstallTypeName']<-4
train[train$Census_OSInstallTypeName=="Update",'Census_OSInstallTypeName']<-5
train[train$Census_OSInstallTypeName=="Refresh",'Census_OSInstallTypeName']<-6
str(train)
train$MachineIdentifier<-as.character(train$MachineIdentifier)
train$AvSigVersion<-as.character(train$AvSigVersion)
train[train$AvSigVersion=="2&#",'AvSigVersion']<-"NA"
train$AvSigVersion<-as.numeric(train$AvSigVersion)
train[which(is.na(train$AvSigVersion)),'AvSigVersion']<-mean(train$AvSigVersion,na.rm=T)
train$ExistsNotSet<-as.numeric(train$ExistsNotSet)
train$IOT<-as.numeric(train$IOT)
train$PC<-as.numeric(train$PC)
train$Server<-as.numeric(train$Server)
na_count<-data.frame(colnames(train),as.numeric(colSums(is.na(train))))
colnames(na_count)<-c("Feature","NAs")
na_count<-na_count[order(na_count$NAs,decreasing = T),]
na_count<-na_count%>%filter(NAs>0)
na_count
write.csv(train,"C:/Data/microsoft-malware-prediction/cleaned_train_FINAL.csv",row.names = F)
exit