-
Notifications
You must be signed in to change notification settings - Fork 1
/
MRS-Hackathon-ArrivalDelayModeling.R
280 lines (264 loc) · 9.19 KB
/
MRS-Hackathon-ArrivalDelayModeling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
rm(list = ls())
set.seed(1)
setwd("D:/rHackathon/")
# we need packages, so let's install them
packages <- c("dplyr", "ggplot2", "stringr", "tidyr", "rmarkdown", "knitr",
"assertr", "glmnet", "devtools")
for (package in packages) {
if (!require(package, character.only = TRUE)) {
install.packages(package, character.only = TRUE)
library(package, character.only = TRUE)
}
}
library("RevoScaleR")
# dplyrXdf is on github, using devtools to install
devtools::install_github("Hong-Revo/dplyrXdf")
library("dplyrXdf")
# I created a lookup table for airline names, ended up not using it
airlineNameTable <- data.frame(
airlineCode = c("PS", "TW", "UA", "WN", "EA", "HP", "NW", "PA (1)", "PI",
"CO", "DL", "AA", "US", "AS", "ML (1)", "AQ", "MQ", "OO",
"XE", "TZ", "EV", "FL", "B6", "DH", "HA", "OH", "F9", "YV",
"9E"),
airlineName = c("Ukraine International Airlines",
"Trans World Airways LLC",
"United Air Lines Inc.",
"Southwest Airlines Co.",
"Eastern Air Lines Inc.",
"America West Airlines Inc.",
"Northwest Airlines Inc.",
"Pan American World Airways",
"Piedmont Aviation Inc.",
"Continental Air Lines Inc.",
"Delta Air Lines Inc.",
"American Airlines Inc.",
"US Airways Inc.",
"Alaska Airlines Inc.",
"Midway Airlines Inc.",
"Aloha Airlines Inc.",
"Envoy Air",
"SkyWest Airlines Inc.",
"ExpressJet Airlines Inc.",
"ATA Airlines d/b/a ATA",
"ExpressJet Airlines Inc.",
"AirTran Airways Corporation",
"JetBlue Airways",
"Independence Air",
"Hawaiian Airlines Inc.",
"PSA Airlines Inc.",
"Frontier Airlines Inc.",
"Mesa Airlines Inc.",
"Endeavor Air Inc."),
stringsAsFactors = FALSE)
### Most RevoScaleR functions start with rx. The rxGetInfo() function examines the metadata on a dataset
### We cannot use str() since data are stored on disk.
rxGetInfo("AirlineDataSubsample.xdf", getVarInfo = TRUE, numRows = 5)
rxGetInfo("AirlineData87to08.xdf", getVarInfo = TRUE, numRows = 5)
# rxDataStep performs data transformations.
# Here we throw out any flights not originating in airports of interest
rxDataStep(inData = "AirlineData87to08.xdf",
outFile = "mainData.xdf",
#outFile = "sample.xdf",
#rowSelection = (Year == 2008) & (Origin == "SEA"),
rowSelection = (Origin %in% c("SEA", "BOS", "SJC", "SFO")),
overwrite = TRUE,
reportProgress = 1)
# clean up missing values and keep only columns we'll need later
# the last argument makes sure we materialize a large data frame in RAM
df <- rxDataStep(inData = "sample.xdf",
rowSelection = !is.na(DepDelay) &
!is.na(DepTime) & !is.na(ArrDelay),
varsToKeep = c("Year",
"Month",
"DayofMonth",
"DayOfWeek",
"Origin",
"Dest",
"DepTime",
"DepDelay",
"ArrDelay",
"UniqueCarrier"
),
maxRowsByCols = Inf)
# xdf <- RxXdfData(file = "sample.xdf",
# another function to push the contents of xdf file into data frame in RAM
xdf <- RxXdfData(file = "mainData.xdf",
varsToKeep = c("Year",
"Month",
"DayofMonth",
"DayOfWeek",
"Origin",
"Dest",
"DepTime",
"DepDelay",
"ArrDelay",
"UniqueCarrier"))
#####################################################
### From here onwards the actual analysis happens ###
#####################################################
# data summary by carrier
carrierSummaryXdf <- xdf %>%
filter(
!is.na(DepDelay) & !is.na(DepTime) & !is.na(ArrDelay)
) %>%
group_by(UniqueCarrier) %>%
summarize(nRows = n()) %>%
mutate(airlineCode = as.character(UniqueCarrier)) %>%
inner_join(airlineNameTable, by = "airlineCode") %>%
arrange(desc(nRows))
# this step materializes the temp file from above
rxDataStep(inData = carrierSummaryXdf)
# data summary by destination
destinationSummaryXdf <- xdf %>%
filter(
!is.na(DepDelay) & !is.na(DepTime) & !is.na(ArrDelay)
) %>%
group_by(Dest) %>%
summarize(nRows = n()) %>%
arrange(desc(nRows))
rxDataStep(inData = destinationSummaryXdf)
# clean data: prune rows with missing values and truncate outliers
xdfClean <- xdf %>%
filter(
!is.na(DepDelay) & !is.na(DepTime) & !is.na(ArrDelay) &
(ArrDelay <= 120) & (ArrDelay >= -120)
) %>%
mutate(
DepHour = floor(DepTime),
DelayBin = floor(ArrDelay / 30),
Origin = ifelse(Origin %in% c("SJC", "SFO"), "SVC",
ifelse(Origin == "SEA", "SEA", "BOS"))
#DelayBin = ArrDelay
) %>%
group_by(
Year,
Origin,
#Dest,
#UniqueCarrier,
DayOfWeek,
DepHour,
DelayBin
) %>%
summarize(count = n()) %>%
arrange(
Year,
Origin,
#Dest,
#UniqueCarrier,
DayOfWeek,
DepHour,
DelayBin
)
rxDataStep(inData = xdfClean,
outFile = "xdfClean.xdf",
overwrite = TRUE)
rxGetInfo(xdfClean, getVarInfo = T, numRows = 2)
# prepare data for making impromptu histograms
totalsForBinsXdf <- xdfClean %>%
group_by(
Year,
Origin,
#Dest,
#UniqueCarrier,
DayOfWeek,
DepHour
) %>%
summarize(binCount = sum(count)) %>%
arrange(
Year,
Origin,
#Dest,
#UniqueCarrier,
DayOfWeek,
DepHour
)
rxDataStep(inData = totalsForBinsXdf,
outFile = "totalsForBinsXdf.xdf",
overwrite = TRUE)
rxGetInfo(totalsForBinsXdf, getVarInfo = T, numRows = 2)
# somehow native MRS function rxMerge() worked better than
# the dplyrXdf::inner_join()
proportionsXdf <- rxMerge(
inData1 = "xdfClean.xdf",
inData2 = "totalsForBinsXdf.xdf",
matchVars = intersect(names(rxDataStep("xdfClean.xdf")),
names(rxDataStep("totalsForBinsXdf")))
) %>%
mutate(
freq = 100 * round(count / binCount, digits = 4),
Hour = ifelse(DepHour > 12, DepHour - 12, DepHour),
Hour = ifelse(DepHour < 12 , str_c(Hour, " AM"), str_c(Hour, " PM"))
) %>%
arrange(
Year,
Origin,
#Dest,
#UniqueCarrier,
DayOfWeek,
DepHour,
DelayBin
)
rxDataStep(inData = proportionsXdf,
outFile = "proportionsXdf.xdf",
overwrite = TRUE)
cleanProportionsXdf <- proportionsXdf %>%
select(-one_of("Year", "count", "binCount"))
prop <- rxDataStep(inData = cleanProportionsXdf)
View(prop)
# histograms are computed, now bindata by departure time
mainData <- prop %>%
mutate(Departure = ifelse(DepHour < 6, "12AM to 06AM",
ifelse(DepHour < 12, "06AM to 12PM",
ifelse(DepHour < 18, "12PM to 06PM", "06PM to 12AM"))),
Departure = factor(Departure,
levels = c("12AM to 06AM", "06AM to 12PM",
"12PM to 06PM", "06PM to 12AM"),
ordered = TRUE)
) %>%
group_by(
Origin,
Departure,
DayOfWeek,
DelayBin
) %>%
summarize(freq = mean(freq)) #%>%
#filter(DepHour == 0)
# making exhaustive grid for plotting
mainGrid <- expand.grid(
unique(mainData[["Origin"]]),
unique(mainData[["Departure"]]),
unique(mainData[["DayOfWeek"]]),
unique(mainData[["DelayBin"]])
)
names(mainGrid) <- c("Origin", "Departure", "DayOfWeek", "DelayBin")
forPlot <- merge(mainGrid, mainData, all.x = TRUE,
by = c("Origin", "Departure", "DayOfWeek", "DelayBin")) %>%
mutate(freq = ifelse(is.na(freq), 0, freq))
# using ggplot to produce plots
plotBOS <- ggplot(data = filter(forPlot, Origin == "BOS"),
aes(x = DelayBin,
y = freq)
) +
#facet_grid(DayOfWeek ~ .) +
facet_grid(Departure ~ DayOfWeek) +
geom_line() + geom_point() +
geom_vline(xintercept = 0, color = "blue")
plotSEA <- ggplot(data = filter(forPlot, Origin == "SEA"),
aes(x = DelayBin,
y = freq)
) +
#facet_grid(DayOfWeek ~ .) +
facet_grid(Departure ~ DayOfWeek) +
geom_line() + geom_point() +
geom_vline(xintercept = 0, color = "blue")
plotSVC <- ggplot(data = filter(forPlot, Origin == "SVC"),
aes(x = DelayBin,
y = freq)
) +
#facet_grid(DayOfWeek ~ .) +
facet_grid(Departure ~ DayOfWeek) +
geom_line() + geom_point() +
geom_vline(xintercept = 0, color = "blue")
print(plotBOS)
print(plotSEA)
print(plotSVC)