-
Notifications
You must be signed in to change notification settings - Fork 0
/
Legal Data.R
120 lines (87 loc) · 3.52 KB
/
Legal Data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Download the data set as a data frame; text columns are kept as character
# vectors rather than being converted to factors.
legal <- read.csv(
  file = "https://query.data.world/s/5tpkrphqrkh5chf6okk4d2zi2tuqyo",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Compact overview of the columns and their types.
str(legal)
# Put the columns on the search path so the sections below can refer to them
# by bare name (Incident.Date, Department, Amount, Category, Claim.Name).
# NOTE(review): attach() is generally discouraged; it is kept because every
# later section relies on it.
attach(legal)
#-------------------------------------------------------------------------------
# Parse the timestamp text (month/day/year, 12-hour clock with AM/PM marker)
# as GMT date-times. The assignment creates a global Incident.Date that masks
# the attached column; the column inside `legal` is not modified.
Incident.Date <- as.POSIXlt(
  Incident.Date,
  tz = "GMT",
  format = "%m/%d/%Y %I:%M:%S %p"
)
# Quick sanity check of the parsed values and their class.
head(Incident.Date, n = 10)
class(Incident.Date)
#-------------------------------------------------------------------------------
# Overall time span covered by the recorded incidents.
range(Incident.Date)
# Request the difference in days explicitly. Subtracting two date-times lets
# difftime pick its own unit (secs/mins/hours/days) from the size of the gap,
# so as.numeric() on the raw difference is only in days when the span happens
# to be at least one day long.
days <- as.numeric(
  difftime(max(Incident.Date), min(Incident.Date), units = "days")
)
# Approximate months as 30-day blocks; hours follow directly from days.
months <- round(days / 30)
hours <- days * 24
cat("The total time period of recorded incidents is",
    months, "months or", days, "days or", hours, "hours.")
#-------------------------------------------------------------------------------
# Median incident timestamp.
median(Incident.Date)
#-------------------------------------------------------------------------------
# Timestamp stored at the middle position of the vector, in the order the rows
# were read (the vector is not sorted here, so this is not the median).
middle_position <- round(length(Incident.Date) / 2)
Incident.Date[middle_position]
#-------------------------------------------------------------------------------
# Frequency of each distinct timestamp value: show the most frequent one, then
# the count stored under one specific label.
# NOTE(review): the label "2014-08-28" is hard-coded; it presumably is the
# value reported by which.max() for this data set - confirm.
incidents_per_date <- table(factor(Incident.Date))
which.max(incidents_per_date)
incidents_per_date["2014-08-28"]
#-------------------------------------------------------------------------------
# Label every incident "AM" (hour before 12) or "PM" (hour 12 or later).
incident_hour <- Incident.Date$hour
ampm <- factor(ifelse(incident_hour < 12, "AM", "PM"))
ampm
# Force the label of the second factor level to "PM".
# NOTE(review): presumably a safeguard so a "PM" level always exists - confirm.
levels(ampm)[2] <- "PM"
levels(ampm)
# Which half of the day has more incidents overall.
which.max(table(ampm))
# Department x half-of-day contingency table, built once and reused below.
dept_by_half_day <- table(Department, ampm)
# Position of the largest cell, counted column by column through the table.
which.max(dept_by_half_day)
# NOTE(review): row 3 is hard-coded; it presumably corresponds to the cell
# reported by which.max() above for this data set - confirm.
rownames(dept_by_half_day)[3]
# Counts with row/column totals, then percentages of the grand total (1 d.p.).
addmargins(dept_by_half_day)
round(addmargins(prop.table(dept_by_half_day)) * 100, 1)
#-------------------------------------------------------------------------------
# Incident counts per department, shown as a horizontal bar chart.
Department <- factor(Department)
# NOTE(review): renames whichever level sorts first to "WPD"; this is a
# hard-coded fix for this data set - confirm which raw value it replaces.
levels(Department)[1] <- "WPD"
# Counts sorted ascending, keeping an NA bucket if any department is missing.
# Stored under its own name instead of `table`, so the base function name is
# not shadowed by a data object.
dept_counts <- sort(table(Department, useNA = "ifany"))
names(dimnames(dept_counts)) <- "Department"
# Number of bars, then the counts with their grand total.
length(dept_counts)
addmargins(dept_counts)
# Wide left margin leaves room for the horizontal department labels.
par(mar = c(7, 15, 10, 5), bg = "linen")
barplot(dept_counts,
        #orientation
        horiz = TRUE,
        las = 1,
        # Scale the count axis to the tallest bar actually drawn (this includes
        # the NA bucket, which table(Department) alone would leave out).
        xlim = c(0, max(dept_counts)),
        # ylim is deliberately left to barplot(): the previous value was built
        # from the factor's integer codes, which do not correspond to bar
        # positions (each bar occupies width plus spacing), so bars could
        # extend past the plot region. The default spans every bar.
        #text
        main = "Recorded incidents\nper Department",
        sub = "City of Austin, 2011 - 2015",
        #size
        cex.main = 1,
        #colors
        col.axis = "gray15",
        col.main = "gray15",
        col.sub = "gray15",
        col = "gray",
        border = "gray"
)
# Vertical guide lines only, aligned with the count-axis ticks.
grid(nx = NULL, ny = NA, lwd = 2, lty = 2, col = "gray60")
#-------------------------------------------------------------------------------
# Convert the currency text to numbers. Stripping "$" and "," wherever they
# occur (instead of blindly dropping the first character) keeps values with
# thousands separators from turning into NA and does not eat the first digit
# of a value that has no currency sign.
Amount <- as.numeric(gsub("[$,]", "", Amount))
head(Amount); class(Amount)
summary(Amount)
#NA treatment: impute missing amounts with the median of the known ones.
Amount[is.na(Amount)] <- median(Amount, na.rm = TRUE)
#outlier treatment: values above Q3 + 1.5 * IQR are replaced by the median.
Q3 <- quantile(Amount, probs = 0.75, names = FALSE)
outliers <- which(Amount > Q3 + 1.5 * IQR(Amount))
# One vectorised assignment: the median is computed once, before any value is
# replaced, so every outlier receives the same value. The former element-wise
# loop recomputed the median after each replacement, letting the imputed value
# drift, and iterated over 1:length(...), which misbehaves on empty input.
Amount[outliers] <- median(Amount)
# Total amount per category: the largest one, then one specific category.
# NOTE(review): "00 Auto" is hard-coded; presumably the category reported by
# which.max() for this data set - confirm.
category_totals <- tapply(Amount, Category, sum)
which.max(category_totals)
category_totals["00 Auto"]
#-------------------------------------------------------------------------------
# Category with the highest mean amount.
which.max(tapply(Amount, Category, mean))
#-------------------------------------------------------------------------------
# Peek at the raw claimant names.
head(Claim.Name, n = 35)
# Step 1: take the run of letters that follows a comma and one or more spaces,
# skipping runs that begin with "Inc". Names without a match are dropped.
after_comma <- regexpr("(?<=,) +(?!Inc)[a-zA-Z]{2,30}", Claim.Name, perl = TRUE)
list1 <- regmatches(Claim.Name, after_comma)
list1
# Step 2: the matches still carry their leading spaces; keep the letters only.
letters_only <- regexpr("[a-zA-Z]{2,30}", list1, perl = TRUE)
list1 <- regmatches(list1, letters_only)
list1
# Additional names: the run of letters that directly follows "and ".
after_and <- regexpr("(?<=and )[a-zA-Z]{2,30}", Claim.Name, perl = TRUE)
list2 <- regmatches(Claim.Name, after_and)
list2
# Pool both sets and count how often each extracted name occurs.
firstnames <- factor(c(list1, list2))
nlevels(firstnames)
name_counts <- table(firstnames)
which.max(name_counts)
# NOTE(review): "Jennifer" is hard-coded; presumably the name reported by
# which.max() for this data set - confirm.
name_counts["Jennifer"]
Claim.Name[grep("Jennifer", Claim.Name)]
# Remove the data frame from the search path. Objects assigned at top level
# above (e.g. the parsed Incident.Date) remain in the global environment.
detach(legal)