# modeling.R

# Log in to the remote R server and start a remote session with a command line.
# NOTE(review): the endpoint "http://" has no host - confirm the real server
# address before running. remoteLogin() is presumably provided by mrsdeploy,
# which is not attached in this file - verify it is loaded by the environment.
remoteLogin(
  "http://",
  session = TRUE,
  diff = TRUE,
  commandline = TRUE
)
library(data.table)
library(feather)
library(dplyr)
library(MASS)
library(pROC)
library(randomForest)
library(xgboost)
library(e1071)
library(tree)
library(ISLR)
library(Metrics)
library(mlbench)
library(gbm) 
library(ggplot2)
library(caret)
library(naivebayes)

# Load the labelled training data.
validate_train <- read_feather(
  "C:/Data/microsoft-malware-prediction/train.feather"
)
# Drop the first column (the machine identifier): it is only needed on the
# test set when building the Kaggle submission.
validate_train <- validate_train[, -1]
# Drop column 42 of what remains (Census_InternalBatteryNumberOfCharges),
# which was causing too many problems downstream.
validate_train <- validate_train[, -42]

# Hold out 30% of the rows for validation; the seed makes the split repeatable.
set.seed(52)
n_rows <- nrow(validate_train)
train_rows <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))
validate_test <- validate_train[-train_rows, ]
validate_train <- validate_train[train_rows, ]
rm(train_rows, n_rows)

##########################################################################################

# Correlation plot: pairwise correlations of the training columns, rounded to
# three decimals.
correlation <- round(cor(validate_train), 3)
# Pause the remote session, pull the matrix into the local session and draw
# the plot there, then resume the remote session.
pause()
getRemoteObject("correlation")
library(corrplot)
corrplot(correlation, tl.cex = .5, method = "circle")
resume()

# Model 1: Logistic regression on every predictor
glm_model <- glm(HasDetections ~ ., data = validate_train, family = "binomial")
glm_fitted <- suppressWarnings(
  predict(glm_model, newdata = validate_test, type = "response")
)

# AUC is computed from the predicted probabilities, before thresholding
ROC_logistic <- roc(validate_test$HasDetections, glm_fitted)
plot(ROC_logistic, col = "red")
pROC::auc(ROC_logistic)

# Convert probabilities to 0/1 classes (cut-off 0.5) for the confusion matrix
glm_fitted <- ifelse(glm_fitted >= 0.5, 1, 0)

# Rows = true class, columns = predicted class. Fixing the predicted levels
# keeps the table 2x2 even if the model only ever predicts one class.
# Assumes HasDetections is coded 0/1, so row 2 holds the true positives' row.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(glm_fitted, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified (the diagonal of the table).
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

# Use stepwise selection (both directions, AIC) on the full logistic model
step_search <- stepAIC(glm_model, direction = "both")
step_search$anova
# Refit the formula chosen by the stepwise search. family = "binomial" is
# required here: without it glm() falls back to a Gaussian linear model, which
# is neither the model the search evaluated nor a probability model.
step_both<-glm(HasDetections ~ ProductName + EngineVersion + AppVersion + AvSigVersion + 
                 RtpStateBitfield + IsSxsPassiveMode + AVProductStatesIdentifier + 
                 AVProductsInstalled + AVProductsEnabled + HasTpm + CityIdentifier + 
                 LocaleEnglishNameIdentifier + Platform + Processor + OsVer + 
                 IsProtected + AutoSampleOptIn + IeVerIdentifier + SmartScreen + 
                 Firewall + Census_OEMNameIdentifier + Census_ProcessorCoreCount + 
                 Census_ProcessorModelIdentifier + Census_PrimaryDiskTotalCapacity + 
                 Census_SystemVolumeTotalCapacity + Census_HasOpticalDiskDrive + 
                 Census_TotalPhysicalRAM + Census_InternalPrimaryDisplayResolutionHorizontal + 
                 Census_InternalPrimaryDisplayResolutionVertical + Census_OSVersion + 
                 Census_OSWUAutoUpdateOptionsName + Census_GenuineStateName + 
                 Census_ActivationChannel + Census_IsFlightsDisabled + Census_FlightRing + 
                 Census_FirmwareVersionIdentifier + Census_IsSecureBootEnabled + 
                 Census_IsVirtualDevice + Census_IsTouchEnabled + 
                 Census_IsAlwaysOnAlwaysConnectedCapable + Wdft_IsGamer + 
                 Wdft_RegionIdentifier + ExistsNotSet + IOT + PC,
               data = validate_train, family = "binomial")
glm_fitted2 <- suppressWarnings(
  predict(step_both, newdata = validate_test, type = "response")
)

# AUC of the stepwise model, computed from the predicted probabilities
ROC_logistic2 <- roc(validate_test$HasDetections, glm_fitted2)
plot(ROC_logistic2, col = "red")
pROC::auc(ROC_logistic2)

# Convert probabilities to 0/1 classes (cut-off 0.5) for the confusion matrix
glm_fitted2 <- ifelse(glm_fitted2 >= 0.5, 1, 0)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(glm_fitted2, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

##########################################################################################

# Model 2: SVM

# Cross-validate the SVM to pick the best parameters.
# NOTE(review): no `ranges` grid is supplied, so this presumably only
# cross-validates the default parameters - confirm that is intended.
cv.svm <- tune(svm, HasDetections ~ ., data = validate_train)
cv.svm$best.model

# Fit the chosen configuration on the training split
svm_model <- svm(HasDetections ~ ., data = validate_train,
                 type = "C-classification", kernel = "radial",
                 epsilon = 0.1, gamma = 0.01515152, cost = 1)
summary(svm_model)

# Predict class labels for the hold-out split (column 65 is dropped as the
# response - presumably HasDetections; verify against the data layout).
svmfit <- predict(object = svm_model, validate_test[, -65])

# The prediction is a two-level factor; its integer codes 1/2 become 0/1
svmfit <- as.numeric(svmfit) - 1

# AUC here is computed from hard 0/1 labels, not from probabilities
ROC_svm <- roc(validate_test$HasDetections, svmfit)
plot(ROC_svm, col = "purple")
pROC::auc(ROC_svm)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(svmfit, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

##########################################################################################

# Model 3: Gradient Boosting

# Isolate HasDetections into its own y, as a factor: no malware vs. malware
y <- as.factor(validate_train$HasDetections)
levels(y) <- c("no_malware", "malware")
# Create a numeric matrix of features (everything except column 65, which is
# presumably HasDetections - verify against the data layout)
x <- data.matrix(validate_train[, -65])

# Tune the model with 5-fold cross-validation, using ROC as the selection metric
cv_gbm <- caret::train(
  x, y,
  method = "gbm", metric = "ROC",
  trControl = trainControl(method = "cv", number = 5, returnResamp = "none",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE)
)
# Examine tuning results
summary(cv_gbm)
cv_gbm$bestTune

# Apply the best tune to the hold-out split
gbfit <- predict(object = cv_gbm, validate_test[, -65], type = "prob")

# Column 2 is the second class level ("malware"); use its probability for AUC
gbfit <- gbfit[, 2]
ROC_gbm <- roc(validate_test$HasDetections, gbfit)
plot(ROC_gbm, col = "blue")
pROC::auc(ROC_gbm)

# Convert malware probabilities to 0/1 classes (cut-off 0.5)
gbfit <- ifelse(gbfit >= 0.5, 1, 0)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(gbfit, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

##########################################################################################

# Model 4: Naive Bayes

# Tune the model with 5-fold cross-validation, using ROC as the selection
# metric. x (feature matrix) and y (two-level factor) come from the gradient
# boosting section.
cv_nb <- caret::train(
  x, y,
  method = "naive_bayes", metric = "ROC",
  trControl = trainControl(method = "cv", number = 5, returnResamp = "none",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE)
)
# Examine tuning results
summary(cv_nb)
cv_nb$bestTune

# Apply the best Naive Bayes model to the hold-out split
nbfit <- predict(object = cv_nb, validate_test[, -65], type = "prob")

# Column 2 is the probability of the second class level; use it for AUC
nbfit <- nbfit[, 2]
ROC_nb <- roc(validate_test$HasDetections, nbfit)
plot(ROC_nb, col = "green")
pROC::auc(ROC_nb)

# Convert malware probabilities to 0/1 classes (cut-off 0.5)
nbfit <- ifelse(nbfit >= 0.5, 1, 0)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(nbfit, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

##########################################################################################

# Model 5: Random Forest

# Tune mtry using 100 trees per iteration, continuing until the OOB error no
# longer improves by at least 0.1%.
# NOTE(review): with stepFactor = 1 mtry is presumably never inflated or
# deflated, so only the starting mtry would be evaluated - confirm.
bestrf <- tuneRF(x, y, stepFactor = 1, improve = 0.001, ntree = 100,
                 trace = TRUE, plot = TRUE)
print(bestrf)
# Best mtry=8, OOBError=0.35832

# Fit the random forest with the best mtry
bag.windows <- randomForest(as.factor(HasDetections) ~ ., data = validate_train,
                            mtry = 8, importance = TRUE)
bag.windows$importance
# mtry is a training parameter, so it is not passed to predict()
rffit <- predict(bag.windows, newdata = validate_test)

# The prediction is a two-level factor; its integer codes 1/2 become 0/1
rffit <- as.numeric(rffit) - 1

# AUC here is computed from hard 0/1 labels, not from class probabilities
roc_rforest <- roc(validate_test$HasDetections, rffit)
plot(roc_rforest)
pROC::auc(roc_rforest)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(rffit, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

##########################################################################################
# Draw the ROC curves of all five models on one plot, one colour per model.
# The colours are listed in the same order as the curves.
roc_curves <- list(
  gbm = ROC_gbm,
  logit = ROC_logistic,
  nbayes = ROC_nb,
  RForest = roc_rforest,
  svm = ROC_svm
)
model_colours <- c("blue", "red", "green", "black", "purple")
ggroc(roc_curves) +
  scale_color_manual(values = model_colours, name = "Model")

# Develop an ensemble of the 2 best models (gradient boosting + logistic)

# Recompute both models' hold-out probabilities (earlier sections overwrite
# these vectors with 0/1 classes).
glm_fitted <- suppressWarnings(
  predict(glm_model, newdata = validate_test, type = "response")
)
gbfit <- predict(object = cv_gbm, validate_test[, -65], type = "prob")
gbfit <- gbfit[, 2]

# Try every blend from 100% gbm (weight 0) to 100% logistic (weight 1) in
# steps of 0.1. The AUC vector is indexed by position, not by the weight
# step, so the weight-0 result is stored rather than silently dropped.
blend_weights <- (0:10) / 10
pred_auc <- numeric(length(blend_weights))
for (i in seq_along(blend_weights)) {
  glm_weight <- blend_weights[i]
  blended <- ((1 - glm_weight) * gbfit) + (glm_weight * glm_fitted)
  roc_ensemble <- roc(validate_test$HasDetections, blended)
  pred_auc[i] <- as.numeric(pROC::auc(roc_ensemble))
}
# Label each AUC with the logistic weight that produced it
names(pred_auc) <- blend_weights
print(pred_auc)

max(pred_auc)
best_weight <- blend_weights[which.max(pred_auc)]
best_weight

# Blend the predictions with the best weight and convert them to 0/1 classes
# (cut-off 0.5) for the confusion matrix.
ensemble_fit <- ((1 - best_weight) * gbfit) + (best_weight * glm_fitted)
pred_auc2 <- ifelse(ensemble_fit >= 0.5, 1, 0)

# Rows = true class, columns = predicted class; the fixed predicted levels keep
# the table 2x2. Assumes HasDetections is coded 0/1.
conf_mat <- table(
  true = validate_test$HasDetections,
  predicted = factor(pred_auc2, levels = c(0, 1))
)
conf_mat
# Accuracy = correctly classified / all classified.
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
# False negative rate = actual positives predicted as 0 / all actual positives.
false_negative <- conf_mat[2, 1] / sum(conf_mat[2, ])
accuracy
false_negative

# Load in the test set
test <- read_feather("C:/Data/microsoft-malware-prediction/test.feather")
# Keep the IDs in their own vector for the submission file
MachineIdentifier <- test$MachineIdentifier
# Remove the ID column
test <- test[, -1]
# Remove the number-of-charges column (same position as dropped from training)
test <- test[, -42]

# Predict on the test set with both ensemble members
glm_fitted <- suppressWarnings(
  predict(glm_model, newdata = test, type = "response")
)
gbfit <- predict(object = cv_gbm, test, type = "prob")
gbfit <- gbfit[, 2]
# Blend with the logistic weight fixed at 0/10, i.e. the submission is the
# gradient boosting probability alone. Despite its name, pred_auc holds the
# blended probabilities here, not AUC values.
glm_weight <- 0 / 10
pred_auc <- ((1 - glm_weight) * gbfit) + (glm_weight * glm_fitted)

# Build the submission as a data frame so HasDetections stays numeric
# (cbind() on a character ID and numeric predictions yields a character matrix).
pred <- data.frame(
  MachineIdentifier = MachineIdentifier,
  HasDetections = pred_auc,
  stringsAsFactors = FALSE
)

# Bring predictions into the local session and write the submission file
pause()
getRemoteObject("pred")
write.csv(pred, "predictions.csv", row.names = FALSE)