-
Notifications
You must be signed in to change notification settings - Fork 2
/
Movie review to test accuracy.R
75 lines (66 loc) · 4.23 KB
/
Movie review to test accuracy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#import libraries to work with
library(plyr)
library(stringr)
library(e1071)
#load up word polarity list and format it
afinn_list <- read.delim(file='G:/Mita/Twitter Sentiment Analysis/AFINN/AFINN-111.txt', header=FALSE, stringsAsFactors=FALSE)
names(afinn_list) <- c('word', 'score')
afinn_list$word <- tolower(afinn_list$word)
#categorize words as very negative to very positive and add some movie-specific words
vNegTerms <- afinn_list$word[afinn_list$score==-5 | afinn_list$score==-4]
negTerms <- c(afinn_list$word[afinn_list$score==-3 | afinn_list$score==-2 | afinn_list$score==-1], "second-rate", "moronic", "third-rate", "flawed", "juvenile", "boring", "distasteful", "ordinary", "disgusting", "senseless", "static", "brutal", "confused", "disappointing", "bloody", "silly", "tired", "predictable", "stupid", "uninteresting", "trite", "uneven", "outdated", "dreadful", "bland")
posTerms <- c(afinn_list$word[afinn_list$score==3 | afinn_list$score==2 | afinn_list$score==1], "first-rate", "insightful", "clever", "charming", "comical", "charismatic", "enjoyable", "absorbing", "sensitive", "intriguing", "powerful", "pleasant", "surprising", "thought-provoking", "imaginative", "unpretentious")
vPosTerms <- c(afinn_list$word[afinn_list$score==5 | afinn_list$score==4], "uproarious", "riveting", "fascinating", "dazzling", "legendary")
#load up positive and negative sentences and format
posText <- read.delim(file='G:/Mita/Twitter Sentiment Analysis/rt-polaritydata/rt-polarity_pos.txt', header=FALSE, stringsAsFactors=FALSE)
posText <- posText$V1
posText <- unlist(lapply(posText, function(x) { str_split(x, "\n") }))
negText <- read.delim(file='G:/Mita/Twitter Sentiment Analysis/polarityData/rt-polaritydata/rt-polarity_neg.txt', header=FALSE, stringsAsFactors=FALSE)
negText <- negText$V1
negText <- unlist(lapply(negText, function(x) { str_split(x, "\n") }))
#function to calculate number of words in each category within a sentence
sentimentScore <- function(sentences, vNegTerms, negTerms, posTerms, vPosTerms){
final_scores <- matrix('', 0, 5)
scores <- laply(sentences, function(sentence, vNegTerms, negTerms, posTerms, vPosTerms){
initial_sentence <- sentence
#remove unnecessary characters and split up by word
sentence <- gsub('[[:punct:]]', '', sentence)
sentence <- gsub('[[:cntrl:]]', '', sentence)
sentence <- gsub('\\d+', '', sentence)
sentence <- tolower(sentence)
wordList <- str_split(sentence, '\\s+')
words <- unlist(wordList)
#build vector with matches between sentence and each category
vPosMatches <- match(words, vPosTerms)
posMatches <- match(words, posTerms)
vNegMatches <- match(words, vNegTerms)
negMatches <- match(words, negTerms)
#sum up number of words in each category
vPosMatches <- sum(!is.na(vPosMatches))
posMatches <- sum(!is.na(posMatches))
vNegMatches <- sum(!is.na(vNegMatches))
negMatches <- sum(!is.na(negMatches))
score <- c(vNegMatches, negMatches, posMatches, vPosMatches)
#add row to scores table
newrow <- c(initial_sentence, score)
final_scores <- rbind(final_scores, newrow)
return(final_scores)
}, vNegTerms, negTerms, posTerms, vPosTerms)
return(scores)
}
#build tables of positive and negative sentences with scores
posResult <- as.data.frame(sentimentScore(posText, vNegTerms, negTerms, posTerms, vPosTerms))
negResult <- as.data.frame(sentimentScore(negText, vNegTerms, negTerms, posTerms, vPosTerms))
posResult <- cbind(posResult, 'positive')
colnames(posResult) <- c('sentence', 'vNeg', 'neg', 'pos', 'vPos', 'sentiment')
negResult <- cbind(negResult, 'negative')
colnames(negResult) <- c('sentence', 'vNeg', 'neg', 'pos', 'vPos', 'sentiment')
#combine the positive and negative tables
results <- rbind(posResult, negResult)
#run the naive bayes algorithm using all four categories
classifier <- naiveBayes(results[,2:5], results[,6])
#display the confusion table for the classification ran on the same data
confTable <- table(predict(classifier, results), results[,6], dnn=list('predicted','actual'))
confTable
#run a binomial test for confidence interval of results
binom.test(confTable[1,1] + confTable[2,2], nrow(results), p=0.5)