-
Notifications
You must be signed in to change notification settings - Fork 0
/
naivebayes.R
54 lines (50 loc) · 1.9 KB
/
naivebayes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#
# naivebayes.R - a simple Naive Bayes spam filter
# (Solution to CSE312 17wi hw4, problem 5)
#
# Method: given two directories, one each of (hand-labeled) spam and
# non-spam ("ham") emails, we estimate ... (more detail to come...)
# Each message in a test set directory is classified based on whether
# its contents favors S or H based on unique words contained in it,
# based on ...(more detail to come...).
#
# Output: a list of test message file names and predicted classifications,
# one pair per line, plus some ancillary summaries.
#
# Input: three directory names, built-in as constants below, specifying the
# directories containing training spam, training non-spam and test messages.
dir.train.spam <- 'data/train/spam/'
dir.train.ham <- 'data/train/ham/'
dir.test <- 'data/test/'
####
#
# token_set(filename) - Given a file (full path or cwd-relative), return
# a vector containing the unique tokens (i.e., white-space-separated
# strings) in the file.
#
# E.g., given a file containing:
#
# Subject: Get rid of your student loans
# Hi there,
# If you work for us, we will give you money
# to repay your student loans. You will be
# debt free!
# FakePerson_22393
#
# This function would return the length 26 vector:
#
# [1] "Subject:" "Get" "rid" "of" "your"
# [6] "student" "loans" "Hi" "there," "If"
#[11] "you" "work" "for" "us," "we"
#[16] "will" "give" "money" "to" "repay"
#[21] "loans." "You" "be" "debt" "free!"
#[26] "FakePerson_22393"
#
token_set <- function(filename){
return(unique(scan(file=filename,what='character',quote='',quiet=TRUE)))
}
####
#
# You code goes here...
#
###