-
Notifications
You must be signed in to change notification settings - Fork 29
/
Reducing data complexity.R
61 lines (50 loc) · 1.67 KB
/
Reducing data complexity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Load the brand ratings data from its hosted CSV file.
# NOTE(review): the address is a shortened link - confirm it still resolves.
brand.ratings <- read.csv(file = "http://goo.gl/IQl8nc")
# Rescale the data
# Rescaling makes the data more comparable across individuals and samples.
# Common practice is to centre a variable by subtracting its mean from every
# observation, then to express those centred values in units of standard
# deviation. This is known as standardising, normalising or Z-scoring the data.

# `x` stands for any numeric vector; a small example keeps this demo runnable.
x <- c(2, 4, 4, 4, 5, 5, 7, 9)

# The basic formula is below. The parentheses around `x - mean(x)` matter:
# without them `/` binds tighter than `-`, giving x - (mean(x) / sd(x)).
x.sc <- (x - mean(x)) / sd(x)

# But to make it easier you can just use scale(), which centres and scales by
# default. Note that it returns a one-column matrix rather than a plain vector.
x.sc <- scale(x)
# Scaling our data
# Take a copy so the raw ratings stay untouched, then overwrite the first nine
# columns of the copy with their standardised values.
brand.sc <- brand.ratings
brand.sc[, seq_len(9)] <- scale(brand.ratings[, seq_len(9)])
# Looking at correlations
# Plot the correlation matrix of the nine scaled columns, with the variables
# arranged in hierarchical-clustering order.
library(corrplot)
corrplot(corr = cor(x = brand.sc[, seq_len(9)]), order = "hclust")
# The plot shows some clusters of 2-3 variables, which will be examined later.
# Aggregating mean ratings by brand: one row per brand, holding the mean of
# every other column.
brand.mean <- aggregate(. ~ brand, data = brand.sc, FUN = mean)
# Tidying up - the brand names in the first column become the row names, after
# which that now-redundant column is removed.
rownames(brand.mean) <- brand.mean[[1]]
brand.mean <- brand.mean[, -1]
# Visualising results using a heatmap
# Draws brand.mean as a heatmap coloured with the 9-step "GnBu" ColorBrewer
# palette. The trace lines, colour key and dendrograms are all switched off.
library(gplots)
library(RColorBrewer)
heatmap.2(
  as.matrix(brand.mean),
  col = brewer.pal(9, "GnBu"),
  trace = "none",
  key = FALSE,
  # Spelled out in full: `dend` only worked through partial argument matching.
  dendrogram = "none",
  # NOTE(review): the leading newlines presumably push the title down, clear
  # of the top margin - confirm against the rendered figure.
  main = "\n\n\n\n\nBrand attributes"
)
# Heatmap of customer segment data, coloured with the 9-step "GnBu" palette and
# drawn without trace lines, colour key or dendrograms.
# NOTE(review): `segments.sc` is not created anywhere in the visible script, so
# it must already exist in the workspace or this call stops with an
# "object not found" error - confirm where it is meant to come from.
heatmap.2(
  as.matrix(segments.sc),
  col = brewer.pal(9, "GnBu"),
  trace = "none",
  key = FALSE,
  # Spelled out in full: `dend` only worked through partial argument matching.
  dendrogram = "none",
  main = "\n\n\n\n\nCustomer Segments"
)
# Customer segments heatmap in the 9-step "OrRd" palette, with explicit axis
# label sizes and layout proportions.
# NOTE(review): `segments.anon` is not created anywhere in the visible script,
# so it must already exist in the workspace or this call stops with an
# "object not found" error - confirm where it is meant to come from.
# heatmap.2 is exported by gplots, so `::` is sufficient; `:::` is only needed
# for a package's internal (non-exported) objects.
gplots::heatmap.2(
  as.matrix(segments.anon),
  col = brewer.pal(9, "OrRd"),
  trace = "none",
  key = FALSE,
  # Spelled out in full: `dend` only worked through partial argument matching.
  dendrogram = "none",
  main = "\n\n\n\n\nCustomer Segments",
  cexRow = 1,
  cexCol = 1,
  lhei = c(1, 1),  # relative heights of the plot layout rows
  lwid = c(3, 1)   # relative widths of the plot layout columns
)