# Artificial Intelligence model using Machine Learning algorithms to predict the success of ICO (cryptocurrency) offerings for fundraising teams and startups
# ---- kNN section, part 1: load, inspect and clean the ICO data ----
# Reads the coursework CSV, reports missingness, mean-imputes the numeric gaps
# in priceUSD and teamSize, and explores boxplot outliers. `crowdfunding` (the
# imputed data) is the object the modelling code after this block consumes;
# the outlier-filtered frames x/y/z/w are exploratory only.
# `rm(list = ls())` was dropped: every object used here is (re)assigned below,
# so wiping the caller's workspace is unnecessary.

# Install VIM only when it is missing instead of on every run.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)

# Replace the missing entries of a numeric vector with the mean of the
# observed entries and return the completed vector.
impute_mean <- function(values) {
  values[is.na(values)] <- mean(values, na.rm = TRUE)
  values
}

# Return `df` without the rows whose `column` value is a boxplot outlier.
# A logical mask is used instead of `df[-which(...), ]`: when nothing is
# flagged, `which()` yields integer(0) and the negative index would drop
# every row.
drop_boxplot_outliers <- function(df, column) {
  flagged <- boxplot(df[[column]], plot = FALSE)$out
  df[!(df[[column]] %in% flagged), , drop = FALSE]
}

icobench <- read.csv("Downloads/Machinelearningcourseworkright.csv")
if (interactive()) View(icobench)
str(icobench)
icobench$success <- factor(icobench$success)
str(icobench$success)

# Complete vs. incomplete rows, plus a per-column summary.
sum(complete.cases(icobench))
sum(!complete.cases(icobench))
summary(icobench)

# Missing-value pattern plot (counts rather than proportions).
aggr(icobench, numbers = TRUE, prop = FALSE)

# ICOs that have a video and a rating of at least 4.
qicobench <- icobench[which(icobench$hasVideo == 1 & icobench$rating >= 4.0), ]

# Mean imputation of the numeric columns with gaps.
si_icobench <- icobench
hist(icobench$priceUSD)
si_icobench$priceUSD <- impute_mean(si_icobench$priceUSD)
hist(icobench$teamSize)
si_icobench$teamSize <- impute_mean(si_icobench$teamSize)
head(si_icobench)

# `success` is a factor, so it has no mean to impute with (the original
# `mean()` call returned NA with a warning). Report missing labels instead.
str(icobench$success)
n_missing_labels <- sum(is.na(si_icobench$success))
if (n_missing_labels > 0) {
  warning(
    n_missing_labels, " rows have no `success` label and cannot be imputed",
    call. = FALSE
  )
}
if (interactive()) View(si_icobench)

crowdfunding <- si_icobench
if (interactive()) View(crowdfunding)
sum(complete.cases(crowdfunding))
sum(!complete.cases(crowdfunding))

# Flag fully duplicated rows. The original ran duplicated() on the single
# column hasVideo, which marks every repeat of that column's values.
dup_crowfunding <- duplicated(crowdfunding)
sum(dup_crowfunding)

# ---- Outlier exploration ----
# priceUSD, teamSize, coinNum and distributedPercentage contain outliers.
x <- si_icobench
table(x$success, x$minInvestment)
table(x$success, x$hasReddit)

outlier <- boxplot(x$priceUSD, plot = FALSE)$out
outlier
boxplot(x$priceUSD)

# NOTE(review): the original filtered on `success %in% outliers` before
# `outliers` existed; priceUSD is assumed to be the intended column because
# `outlier` above is computed from it -- confirm.
x <- drop_boxplot_outliers(x, "priceUSD")
nrow(x)
sum(complete.cases(x))
sum(!complete.cases(x))

y <- drop_boxplot_outliers(x, "teamSize")
nrow(y)

z <- drop_boxplot_outliers(y, "coinNum")
nrow(z)
sum(!complete.cases(z))

w <- drop_boxplot_outliers(z, "distributedPercentage")
sum(complete.cases(w))
# ---- kNN section, part 2: normalise features, split, and classify ----
# Expects `crowdfunding` (the imputed ICO data) to exist already.

# class ships with R; gmodels is installed only when missing.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(class)
library(gmodels)

crowdfunding <- crowdfunding[-1]  # remove the leading id column
table(crowdfunding$success)
crowdfunding$success <- as.character(crowdfunding$success)
if (interactive()) View(crowdfunding)
round(prop.table(table(crowdfunding$success)) * 100, digits = 1)
summary(crowdfunding[c("rating", "priceUSD", "teamSize")])

# Min-max rescale a numeric vector to [0, 1].
#
# x: any vector. Non-numeric input is returned unchanged.
# Returns a vector of the same length. Missing values are ignored when the
# range is computed and stay NA in the result. A constant column has zero
# range, so it maps to 0 instead of producing NaN from 0 / 0.
normalize <- function(x) {
  if (!is.numeric(x) || all(is.na(x))) {
    return(x)
  }
  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]
  if (span == 0) {
    x[!is.na(x)] <- 0
    return(x)
  }
  (x - bounds[1]) / span
}

# NOTE(review): columns 2:10 are taken as the predictors, as in the original;
# knn() needs them to be numeric and to exclude `success` -- confirm against
# the CSV layout.
crowdfunding_n <- as.data.frame(lapply(crowdfunding[2:10], normalize))
summary(crowdfunding_n$priceUSD)

# Sequential 75 % / 25 % split. The original hard-coded rows 1:2075 and
# 2076:2767, which is this split for a 2767-row file.
train_fraction <- 0.75
n_rows <- nrow(crowdfunding_n)
train_rows <- seq_len(floor(train_fraction * n_rows))
test_rows <- setdiff(seq_len(n_rows), train_rows)

crowdfunding_train <- crowdfunding_n[train_rows, ]
crowdfunding_test <- crowdfunding_n[test_rows, ]
crowdfunding_train_labels <- crowdfunding$success[train_rows]
crowdfunding_test_labels <- crowdfunding$success[test_rows]
head(crowdfunding_train_labels)

# Using an odd number for K reduces the chance of ending with a tie vote.
K <- 45

# knn() stops if train, test or cl contain missing values, so every feature
# and label must be complete at this point.
crowdfunding_test_pred <- knn(
  train = crowdfunding_train,
  test = crowdfunding_test,
  cl = crowdfunding_train_labels,
  k = K
)

# Rows are the actual labels and columns the predictions; the original dnn
# labels were the other way round.
CrossTable(
  crowdfunding_test_labels, crowdfunding_test_pred,
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
  dnn = c("actual", "predicted")
)

# Accuracy computed from the predictions instead of typed in by hand.
mean(as.character(crowdfunding_test_pred) == crowdfunding_test_labels)
# ---- kNN section, part 3: evaluate the classifier ----
# Expects crowdfunding_train, crowdfunding_test, crowdfunding_train_labels,
# crowdfunding_test_labels, crowdfunding_test_pred and K to exist already.

for (pkg in c("gmodels", "ROCR", "pROC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", dependencies = TRUE)
}
library(class)
library(gmodels)
library(ROCR)
library(caret)

# Assumes `success` is coded "N"/"Y", as the confusionMatrix() calls in this
# script imply -- TODO confirm. The vote-share arithmetic below is binary-only.
positive_class <- "Y"
class_levels <- levels(factor(crowdfunding_train_labels))
stopifnot(length(class_levels) == 2, positive_class %in% class_levels)
negative_class <- setdiff(class_levels, positive_class)

# Put actual and predicted labels on one shared level set so they can be
# compared and passed to confusionMatrix().
actual <- factor(crowdfunding_test_labels, levels = class_levels)
predicted <- factor(crowdfunding_test_pred, levels = class_levels)

# Misclassification rate. mae() is a regression metric and the original call
# compared the predictions with the whole feature data frame.
mean(predicted != actual)

# kNN has no fitted model to call predict() on. Class probabilities come from
# re-running knn() with prob = TRUE, which attaches the winning class's share
# of the K votes. Ties are broken at random, so borderline rows may differ
# from crowdfunding_test_pred.
knn_votes <- knn(
  train = crowdfunding_train,
  test = crowdfunding_test,
  cl = crowdfunding_train_labels,
  k = K,
  prob = TRUE
)
winning_share <- attr(knn_votes, "prob")
share_positive <- ifelse(
  knn_votes == positive_class, winning_share, 1 - winning_share
)
icobench_test_prob <- cbind(
  prob_yes = share_positive,
  prob_no = 1 - share_positive
)

options(scipen = 999)
head(icobench_test_prob)

icobench_results <- data.frame(
  actual_type = actual,
  predict_type = predicted,
  prob_yes = round(icobench_test_prob[, "prob_yes"], 5),
  prob_no = round(icobench_test_prob[, "prob_no"], 5)
)
head(icobench_results)
head(subset(icobench_results, actual_type != predict_type))

CrossTable(
  icobench_results$actual_type, icobench_results$predict_type,
  dnn = c("actual", "predict"),
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE
)

confusionMatrix(
  data = icobench_results$predict_type,
  reference = icobench_results$actual_type,
  positive = positive_class
)

# ROC via pROC. Calls are namespaced so they work without attaching pROC and
# cannot be confused with the auc() that the Metrics package also exports.
ROC <- pROC::roc(
  response = icobench_results$actual_type,
  predictor = icobench_results$prob_yes,
  levels = c(negative_class, positive_class),
  direction = "<"
)
plot(ROC)
pROC::auc(ROC)

# ROC via ROCR.
pred_object <- prediction(
  icobench_results$prob_yes,
  icobench_results$actual_type,
  label.ordering = c(negative_class, positive_class)
)
roc_knn <- performance(pred_object, measure = "tpr", x.measure = "fpr")
plot(
  roc_knn,
  main = "ROC of success of companies for crowdfunding ",
  col = "blue", lwd = 2
)
abline(a = 0, b = 1, lwd = 2, lty = 2)  # chance line
auc_object_knn <- performance(pred_object, measure = "auc")
auc_knn <- auc_object_knn@y.values[[1]]
auc_knn
# ---- Naive Bayes section, part 1: slogans to a document-term matrix ----
# Reads the Naive Bayes CSV, cleans the `brandSlogan` text with tm, builds a
# document-term matrix and splits it sequentially into train and test parts.
# `rm(list = ls())` was dropped: everything used here is (re)assigned below.

for (pkg in c("tm", "SnowballC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tm)
library(SnowballC)

icobench <- read.csv("Downloads/MLNB.csv")
icobench <- icobench[-1]  # remove the leading id column
if (interactive()) View(icobench)
str(icobench)
icobench$success <- factor(icobench$success)
str(icobench$success)
table(icobench$success)

# One document per slogan.
icobench_corpus <- VCorpus(VectorSource(icobench$brandSlogan))
print(icobench_corpus)
inspect(icobench_corpus[1:2])
# `[[` extracts the document itself; `[` would return a one-element corpus.
as.character(icobench_corpus[[1]])
lapply(icobench_corpus[1:5], as.character)

# Cleaning pipeline: lower-case, then strip numbers, stop words and
# punctuation, then stem and collapse whitespace.
icobench_corpus_clean <- tm_map(icobench_corpus, content_transformer(tolower))
as.character(icobench_corpus[[1]])
as.character(icobench_corpus_clean[[1]])

icobench_corpus_clean <- tm_map(icobench_corpus_clean, removeNumbers)
icobench_corpus_clean <- tm_map(icobench_corpus_clean, removeWords, stopwords())
icobench_corpus_clean <- tm_map(icobench_corpus_clean, removePunctuation)
icobench_corpus_clean <- tm_map(icobench_corpus_clean, stemDocument)
icobench_corpus_clean <- tm_map(icobench_corpus_clean, stripWhitespace)

# Before / after comparison of the first three slogans.
lapply(icobench_corpus[1:3], as.character)
lapply(icobench_corpus_clean[1:3], as.character)

icobench_dtm <- DocumentTermMatrix(icobench_corpus_clean)
dim(icobench_dtm)
inspect(icobench_dtm[1:5, 1:10])

# The split below is sequential, so the seed only matters for later code.
set.seed(123)

# First 85 % of the documents for training, the rest for testing.
ratio <- 0.85
n_docs <- nrow(icobench_dtm)
p_index <- round(n_docs * ratio)
train_rows <- seq_len(p_index)
# setdiff() stays empty when p_index == n_docs, whereas
# `(p_index + 1):n_docs` would count backwards.
test_rows <- setdiff(seq_len(n_docs), train_rows)

icobench_dtm_train <- icobench_dtm[train_rows, ]
icobench_dtm_test <- icobench_dtm[test_rows, ]
icobench_train_labels <- icobench$success[train_rows]
icobench_test_labels <- icobench$success[test_rows]
prop.table(table(icobench_train_labels))
prop.table(table(icobench_test_labels))

# Keep terms that occur at least 10 times. They are chosen from the training
# documents only, so the test set does not influence feature selection (the
# original used the full matrix).
icobench_Freq_words <- findFreqTerms(icobench_dtm_train, 10)
str(icobench_Freq_words)
icobench_dtm_Freq_train <- icobench_dtm_train[, icobench_Freq_words]
icobench_dtm_Freq_test <- icobench_dtm_test[, icobench_Freq_words]
# Convert term counts into categorical presence flags.
#
# x: numeric vector or matrix of counts.
# Returns a character object of the same shape holding "Yes" where the count
# is positive and "No" otherwise; NA counts stay NA.
# The value is returned directly: the original ended in an assignment, so the
# result was invisible when the function was called at top level.
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
# ---- Naive Bayes section, part 2: train, predict and evaluate ----
# Expects icobench_dtm_Freq_train, icobench_dtm_Freq_test,
# icobench_train_labels, icobench_test_labels and convert_counts() to exist.

for (pkg in c("e1071", "gmodels", "ROCR", "pROC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", dependencies = TRUE)
}
library(e1071)
library(gmodels)
library(caret)
library(ROCR)

# Assumes `success` is coded "N"/"Y", as the confusionMatrix() calls in this
# script imply -- TODO confirm.
positive_class <- "Y"
class_levels <- levels(icobench_train_labels)
stopifnot(length(class_levels) == 2, positive_class %in% class_levels)
negative_class <- setdiff(class_levels, positive_class)

# Turn a document-term matrix into a data frame of "No"/"Yes" factors.
# Every column gets both levels explicitly, so a column that is all "Yes" or
# all "No" in one split is still encoded the same way in train and test.
# NOTE(review): this guards against predict() recoding character columns by
# their sorted unique values -- confirm against the installed e1071 version.
as_flag_frame <- function(dtm) {
  flags <- as.data.frame(
    convert_counts(as.matrix(dtm)),
    stringsAsFactors = FALSE
  )
  flags[] <- lapply(flags, factor, levels = c("No", "Yes"))
  flags
}

icobench_train <- as_flag_frame(icobench_dtm_Freq_train)
icobench_test <- as_flag_frame(icobench_dtm_Freq_test)

# ---- Model without Laplace smoothing ----
icobench_classifier <- naiveBayes(icobench_train, icobench_train_labels)
icobench_test_pred <- predict(icobench_classifier, icobench_test)

CrossTable(
  icobench_test_pred, icobench_test_labels,
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
  dnn = c("predicted", "actual")
)
# `positive` must be a class label; the original passed the logical TRUE.
confusionMatrix(
  data = icobench_test_pred,
  reference = icobench_test_labels,
  positive = positive_class,
  dnn = c("predicted", "actual")
)
# Accuracy computed from the predictions instead of typed in by hand.
mean(icobench_test_pred == icobench_test_labels)

# ---- Model with Laplace smoothing ----
icobench_classifier_laplace <- naiveBayes(
  icobench_train, icobench_train_labels,
  laplace = 1
)
icobench_test_pred_laplace <- predict(icobench_classifier_laplace, icobench_test)
CrossTable(
  icobench_test_pred_laplace, icobench_test_labels,
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
  dnn = c("predicted", "actual")
)

# ---- Evaluation ----
icobench_test_prob <- predict(icobench_classifier, icobench_test, type = "raw")
options(scipen = 999)
head(icobench_test_prob)

# Probabilities are picked by class name: the raw matrix follows the factor
# level order, so column 1 is not the "yes" class. Predictions stay a factor
# (the original as.numeric() broke the comparison and confusionMatrix()).
icobench_results <- data.frame(
  actual_type = icobench_test_labels,
  predict_type = icobench_test_pred,
  prob_yes = round(icobench_test_prob[, positive_class], 10),
  prob_no = round(icobench_test_prob[, negative_class], 10)
)
head(icobench_results)
head(subset(icobench_results, actual_type != predict_type))

CrossTable(
  icobench_results$actual_type, icobench_results$predict_type,
  dnn = c("actual", "predict"),
  prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE
)
confusionMatrix(
  data = icobench_results$predict_type,
  reference = icobench_results$actual_type,
  positive = positive_class
)

# ROC via pROC: response first, then the predictor. The predicted probability
# is used as the score rather than the hard class label. Calls are namespaced
# so auc() cannot resolve to the one the Metrics package exports.
ROCurve <- pROC::roc(
  response = icobench_test_labels,
  predictor = icobench_results$prob_yes,
  levels = c(negative_class, positive_class),
  direction = "<"
)
plot(ROCurve, col = "yellow", lwd = 3, main = "ROC cCurve")
pROC::auc(ROCurve)

# ROC via ROCR, scored on the same probability.
pred_object <- prediction(
  icobench_results$prob_yes,
  icobench_results$actual_type,
  label.ordering = c(negative_class, positive_class)
)
roc_NB <- performance(pred_object, measure = "tpr", x.measure = "fpr")
plot(
  roc_NB,
  main = "ROC of success of companies for crowdfunding ",
  col = "blue", lwd = 2
)
abline(a = 0, b = 1, lwd = 2, lty = 2)  # chance line
auc_object_NB <- performance(pred_object, measure = "auc")
auc_NB <- auc_object_NB@y.values[[1]]
auc_NB
# ---- Decision tree analysis (C5.0) ----
# Reads the decision-tree CSV, mean-imputes priceUSD and teamSize, splits the
# rows 75 / 25 at random and fits a plain and a boosted C5.0 tree.
# `rm(list = ls())` was dropped: everything used here is (re)assigned below.

for (pkg in c("VIM", "C50", "gmodels")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", dependencies = TRUE)
}
library(VIM)
library(C50)
library(tidyverse)
library(gmodels)
library(caret)

icobench <- read.csv("Downloads/MLDT.csv")
if (interactive()) View(icobench)
str(icobench)
icobench$success <- factor(icobench$success)
str(icobench$success)

sum(complete.cases(icobench))
sum(!complete.cases(icobench))
summary(icobench)
aggr(icobench, numbers = TRUE, prop = FALSE)

# ICOs that have a video and a rating of at least 4.
qicobench <- icobench[which(icobench$hasVideo == 1 & icobench$rating >= 4.0), ]

# Mean imputation of the numeric columns with gaps.
si_icobench <- icobench
hist(icobench$priceUSD)
si_icobench$priceUSD[is.na(si_icobench$priceUSD)] <-
  mean(si_icobench$priceUSD, na.rm = TRUE)
hist(icobench$teamSize)
si_icobench$teamSize[is.na(si_icobench$teamSize)] <-
  mean(si_icobench$teamSize, na.rm = TRUE)
head(si_icobench)
sum(complete.cases(si_icobench))

ICO <- si_icobench
# View() is the base viewer; the original lower-case view() only exists once
# the tidyverse is attached.
if (interactive()) View(ICO)

table(ICO$startDate)
table(ICO$endDate)
summary(ICO$platform)
ICO$hasGithub <- factor(ICO$hasGithub)

# ---- 1. Random 75 % / 25 % split ----
smp_size <- floor(0.75 * nrow(ICO))
set.seed(12345)
# The original drew sample(10, 5) right after seeding; the draw is kept so
# the random stream, and therefore the split, matches earlier runs.
invisible(sample(10, 5))
train_ico <- sample(nrow(ICO), smp_size)
crowdfunding_train <- ICO[train_ico, ]
crowdfunding_test <- ICO[-train_ico, ]

# ---- 2. Training a model on the data: basic ----
# The C5.0 algorithm from the C50 package trains the decision tree.
# NOTE(review): as in the original, only column 6 is used as a predictor.
# The boosted model originally received an empty select(), which has no
# columns to fit on; both models now share this predictor set -- confirm.
predictor_cols <- 6
icobench_model <- C5.0(
  crowdfunding_train[predictor_cols],
  crowdfunding_train$success
)
icobench_model
summary(icobench_model)

crowdfunding_pred <- predict(icobench_model, crowdfunding_test)

# Misclassification rate. mae() is a regression metric and cannot be computed
# on factor labels.
mean(crowdfunding_pred != crowdfunding_test$success)

# ---- 3. Evaluating the model ----
# predict() gives a vector of predicted classes, compared with the actual
# classes through gmodels::CrossTable(). With prop.c and prop.r set to FALSE
# the column and row percentages are dropped; the remaining percentage
# (prop.t) is each cell's share of all records.
CrossTable(
  crowdfunding_pred, crowdfunding_test$success,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("predicted default", "actual")
)
confusionMatrix(crowdfunding_pred, crowdfunding_test$success, positive = "Y")

# ---- 4. Boosted tree (10 trials) ----
crowdfunding_boost <- C5.0(
  crowdfunding_train[predictor_cols],
  crowdfunding_train$success,
  trials = 10
)
crowdfunding_boost
summary(crowdfunding_boost)

crowdfunding_boost_pred <- predict(crowdfunding_boost, crowdfunding_test)
CrossTable(
  crowdfunding_boost_pred, crowdfunding_test$success,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("predicted default", "actual")
)
# ---- rpart, Cubist and random forest ----
# Reads the coursework CSV, splits it 75 / 25 at random and fits an rpart
# classification tree, a Cubist model and a random forest on the training
# rows only.
# `rm(list = ls())` was dropped: everything used here is (re)assigned below.

for (pkg in c("rpart.plot", "gmodels", "Metrics", "Cubist", "randomForest")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(rpart.plot)
library(gmodels)
library(Metrics)
library(Cubist)
library(tidyverse)
library(randomForest)

icobench <- read.csv("Downloads/Machinelearningcourseworkright.csv")
str(icobench)
icobench <- icobench[-1]  # remove the leading id column
if (interactive()) View(icobench)
icobench$success <- factor(icobench$success)
str(icobench$success)

# Random 75 % / 25 % split.
smp_size <- floor(0.75 * nrow(icobench))
set.seed(987)
train_ind <- sample(nrow(icobench), smp_size)
icobench_train <- icobench[train_ind, ]
icobench_test <- icobench[-train_ind, ]

# ---- Classification tree ----
# Fitted on the training rows; the original used the full data set, so the
# test rows had already been seen by the model.
m.rpart <- rpart(success ~ ., data = icobench_train, method = "class")
summary(m.rpart)
rpart.plot(m.rpart, digits = 2)
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)

# type = "class" returns labels; the default for a classification tree is a
# probability matrix, which cannot be cross-tabulated.
p.rpart <- predict(m.rpart, icobench_test, type = "class")
CrossTable(
  p.rpart, icobench_test$success,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("predicted", "actual")
)

# ---- Error metrics on a 0/1 success indicator ----
# mae()/rmse()/mean() need numbers, so the factor is recoded as 1 for the
# positive class and 0 otherwise and compared with predicted probabilities.
# Assumes `success` is coded "N"/"Y", as the confusionMatrix() calls in this
# script imply -- TODO confirm.
positive_class <- "Y"
stopifnot(positive_class %in% levels(icobench$success))
train_success <- as.numeric(icobench_train$success == positive_class)
test_success <- as.numeric(icobench_test$success == positive_class)

prob.rpart <- predict(m.rpart, icobench_test, type = "prob")[, positive_class]
mae(test_success, prob.rpart)
rmse(test_success, prob.rpart)

# Baseline: always predict the training success rate.
baseline <- mean(train_success)
mae(test_success, baseline)
rmse(test_success, baseline)

# ---- Cubist ----
# Cubist is a regression method, so it is trained on the 0/1 indicator rather
# than on the factor.
m.cubist <- cubist(x = select(icobench_train, -success), y = train_success)
summary(m.cubist)
p.cubist <- predict(m.cubist, select(icobench_test, -success))
mae(test_success, p.cubist)
rmse(test_success, p.cubist)

# ---- Random forest classifier ----
# Trained on the training predictors and labels, then applied to the held-out
# rows. The original fitted on the test rows with a data-frame `y` and
# predicted on `icobench[train_ind == 2, ]`, which is not a row selector.
# NOTE(review): the x/y interface does not impute; predictors with missing
# values need handling beforehand -- confirm the CSV is complete.
m.rf <- randomForest(
  x = select(icobench_train, -success),
  y = icobench_train$success,
  ntree = 20
)
p.rf <- predict(m.rf, select(icobench_test, -success))