-
Notifications
You must be signed in to change notification settings - Fork 0
/
when-riders-meet-drivers.sql
241 lines (227 loc) · 14.5 KB
/
when-riders-meet-drivers.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
-- ****************************************************************************************************************************
-- when_riders_meet_drivers.sql
-- Purpose: understand ride-hailing seasonality of supply vs seasonality of demand, in hours, for each day of the week
-- Approach: using "Chicago Taxi Trips" from BiqQuery Public Data, assuming it shows a behavior similar to ride-hailing
-- Dialect: BigQuery
-- Author: Isis Santos Costa
-- Date: 2023-04-18
-- ****************************************************************************************************************************
-------------------------------------------------------------------------------------------------------------------------------
-- Function 1 • Get day of week from datetime
-------------------------------------------------------------------------------------------------------------------------------
CREATE TEMPORARY FUNCTION dayOfWeek(x DATETIME) AS (
['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'][ORDINAL(EXTRACT(DAYOFWEEK FROM x))]
);
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 1 • Data collection: fetching data from the original table
-------------------------------------------------------------------------------------------------------------------------------
WITH raw_data AS (
SELECT
unique_key -- REQUIRED STRING Unique identifier for the trip.
, taxi_id -- REQUIRED STRING A unique identifier for the taxi.
, trip_start_timestamp -- NULLABLE TIMESTAMP When the trip started, rounded to nearest 15 minutes.
, trip_seconds -- NULLABLE INTEGER Duration of the trip in seconds.
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE trip_seconds > 0
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 2 • Data cleaning: (a) finding interquartile ranges (IQR) of trip_seconds
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_trip_seconds_iqr AS (
SELECT
APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] AS trip_seconds_iqr_lower
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(2)] AS trip_seconds_med
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] AS trip_seconds_iqr_upper
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] - APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] AS trip_seconds_iqr
FROM raw_data
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 3 • Data cleaning: (i) converting from UTC to Chicago Time, (ii) Excluding outliers: duration (trip_seconds)
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaned_from_duration_outliers AS (
SELECT
unique_key
, taxi_id
, DATETIME(trip_start_timestamp, 'America/Chicago') AS trip_start_local_datetime
, trip_seconds
FROM raw_data, data_cleaning_trip_seconds_iqr
WHERE (trip_seconds BETWEEN trip_seconds_iqr_lower - 1.5 * trip_seconds_iqr
AND trip_seconds_iqr_upper + 1.5 * trip_seconds_iqr)
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 4 • Data cleaning: checking results from cleaning (i) + (ii)
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_duration_outliers_results AS (
SELECT
'raw_data' AS cte
, COUNT(*) record_cnt
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(2)] median_trip_seconds
, AVG(trip_seconds) avg_trip_seconds
, MIN(trip_seconds) min_trip_seconds
, MAX(trip_seconds) max_trip_seconds
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] q1_trip_seconds
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] q3_trip_seconds
, ( APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] - APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] ) iqr_trip_seconds
FROM raw_data
UNION ALL
SELECT
'data_cleaned_from_duration_outliers' AS cte
, COUNT(*) record_cnt
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(2)] median_trip_seconds
, AVG(trip_seconds) avg_trip_seconds
, MIN(trip_seconds) min_trip_seconds
, MAX(trip_seconds) max_trip_seconds
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] q1_trip_seconds
, APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] q3_trip_seconds
, ( APPROX_QUANTILES(trip_seconds, 4)[OFFSET(3)] - APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] ) iqr_trip_seconds
FROM data_cleaned_from_duration_outliers
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 5 • Data cleaning: (b) aggregating partially clean data, preparing to exclude extreme hours (esp. peaks)
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_agg AS (
SELECT
DATETIME_TRUNC(trip_start_local_datetime, HOUR) AS trip_start_local_datehour
, COUNT(DISTINCT unique_key) AS hourly_trip_cnt
, COUNT(DISTINCT taxi_id) AS hourly_taxi_cnt
FROM data_cleaned_from_duration_outliers
GROUP BY trip_start_local_datehour
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 6 • Data cleaning: (c) finding interquartile ranges (IQR) of hourly_trip_cnt, hourly_taxi_cnt
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_trips_taxis_iqr AS (
SELECT
APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] AS hourly_trip_cnt_iqr_lower
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(2)] AS hourly_trip_cnt_med
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] AS hourly_trip_cnt_iqr_upper
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] AS hourly_trip_cnt_iqr
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] AS hourly_taxi_cnt_iqr_lower
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(2)] AS hourly_taxi_cnt_med
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] AS hourly_taxi_cnt_iqr_upper
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] AS hourly_taxi_cnt_iqr
FROM data_cleaning_agg
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 7 • Data cleaning: (iii) based on hourly_trip_cnt, hourly_taxi_cnt, remove extreme hours from pre-cleaned (i)+(ii) data
-------------------------------------------------------------------------------------------------------------------------------
, clean_data AS (
SELECT
trip_start_local_datetime
, unique_key
, taxi_id
, trip_seconds
FROM data_cleaned_from_duration_outliers, data_cleaning_trips_taxis_iqr
JOIN data_cleaning_agg
ON data_cleaning_agg.trip_start_local_datehour = DATETIME_TRUNC(trip_start_local_datetime, HOUR)
WHERE (hourly_trip_cnt BETWEEN hourly_trip_cnt_iqr_lower - 1.5 * hourly_trip_cnt_iqr
AND hourly_trip_cnt_iqr_upper + 1.5 * hourly_trip_cnt_iqr)
AND (hourly_taxi_cnt BETWEEN hourly_taxi_cnt_iqr_lower - 1.5 * hourly_taxi_cnt_iqr
AND hourly_taxi_cnt_iqr_upper + 1.5 * hourly_taxi_cnt_iqr)
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 8 • Data cleaning: (c) aggregating final clean data
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_agg_clean_data AS (
SELECT
DATETIME_TRUNC(trip_start_local_datetime, HOUR) AS trip_start_local_datehour
, COUNT(DISTINCT unique_key) AS hourly_trip_cnt
, COUNT(DISTINCT taxi_id) AS hourly_taxi_cnt
FROM clean_data
GROUP BY trip_start_local_datehour
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 9 • Data cleaning: checking results from cleaning (iii)
-------------------------------------------------------------------------------------------------------------------------------
, data_cleaning_results AS (
SELECT
'data_cleaning_agg' AS cte
, COUNT(*) record_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(2)] median_hourly_trip_cnt
, AVG(hourly_trip_cnt) avg_hourly_trip_cnt
, MIN(hourly_trip_cnt) min_hourly_trip_cnt
, MAX(hourly_trip_cnt) max_hourly_trip_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] q1_hourly_trip_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] q3_hourly_trip_cnt
, ( APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] ) iqr_hourly_trip_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(2)] median_hourly_taxi_cnt
, AVG(hourly_taxi_cnt) avg_hourly_taxi_cnt
, MIN(hourly_taxi_cnt) min_hourly_taxi_cnt
, MAX(hourly_taxi_cnt) max_hourly_taxi_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] q1_hourly_taxi_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] q3_hourly_taxi_cnt
, ( APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] ) iqr_hourly_taxi_cnt
FROM data_cleaning_agg
UNION ALL
SELECT
'data_cleaning_agg_clean_data' AS cte
, COUNT(*) record_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(2)] median_hourly_trip_cnt
, AVG(hourly_trip_cnt) avg_hourly_trip_cnt
, MIN(hourly_trip_cnt) min_hourly_trip_cnt
, MAX(hourly_trip_cnt) max_hourly_trip_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] q1_hourly_trip_cnt
, APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] q3_hourly_trip_cnt
, ( APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_trip_cnt, 4)[OFFSET(1)] ) iqr_hourly_trip_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(2)] median_hourly_taxi_cnt
, AVG(hourly_taxi_cnt) avg_hourly_taxi_cnt
, MIN(hourly_taxi_cnt) min_hourly_taxi_cnt
, MAX(hourly_taxi_cnt) max_hourly_taxi_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] q1_hourly_taxi_cnt
, APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] q3_hourly_taxi_cnt
, ( APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(3)] - APPROX_QUANTILES(hourly_taxi_cnt, 4)[OFFSET(1)] ) iqr_hourly_taxi_cnt
FROM data_cleaning_agg_clean_data
)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 10 • Data analysis: typical duration of trips, according to clean data
-------------------------------------------------------------------------------------------------------------------------------
, typical_trip_seconds AS
(SELECT APPROX_QUANTILES(trip_seconds, 4)[OFFSET(1)] AS median_trip_seconds FROM clean_data)
-------------------------------------------------------------------------------------------------------------------------------
-- CTE 11 • Data analysis: hourly count of trips (demand) + (estimated) Hourly count of possible trips (supply)
-------------------------------------------------------------------------------------------------------------------------------
-- Model
-- hourly_trips_supply: total #trips in 1hr that could have happened, based on drivers' availability and typical trip duration
-- hourly_trips_supply =
-- = estimated_number_of_taxis_available_in_the_hour × potential_number_of_trips_per_hour_per_driver
--
-- estimated_number_of_taxis_available_in_the_hour =
-- = number_of_taxis_w_trips_in_the_hour ÷ drivers_typical_idle_time
--
-- potential_number_of_trips_per_hour_per_driver =
-- = 60 ÷ typical_trip_minutes
-------------------------------------------------------------------------------------------------------------------------------
-- Assumption
-- drivers_idle_time = 2/3
-- Ref.: https://www.uberpeople.net/threads/what-is-your-idle-time-and-idle-running-in-km-as-uber-driver.146607/
-------------------------------------------------------------------------------------------------------------------------------
-- Note on impact of Model & Assumption on findings
-- These are only applied in getting realistic absolute numbers, not impacting findings of the analysis [based on proportion]
-------------------------------------------------------------------------------------------------------------------------------
, hourly_supply_demand AS (
SELECT
DATETIME_TRUNC(trip_start_local_datetime, HOUR) AS trip_start_local_datehour
, dayOfWeek(trip_start_local_datetime) AS trip_start_local_dayofweek
, EXTRACT(HOUR FROM trip_start_local_datetime) AS trip_start_local_hour
, (median_trip_seconds / 60.0) AS typical_trip_minutes
, CAST(FLOOR(60 / (median_trip_seconds / 60.0)) AS INT64) AS potential_number_of_trips_per_hour_per_driver
, COUNT(DISTINCT unique_key) AS hourly_trips_demand
, CAST(FLOOR((COUNT(DISTINCT taxi_id)/(2/3)) * FLOOR(60/(AVG(median_trip_seconds)/60.0))) AS INT64) AS hourly_trips_supply
FROM clean_data, typical_trip_seconds
GROUP BY 1, 2, 3, 4, 5
)
-------------------------------------------------------------------------------------------------------------------------------
-- Unit tests / Final query
-------------------------------------------------------------------------------------------------------------------------------
-- SELECT COUNT(*) AS record_cnt FROM raw_data -- 194,639,776
-- SELECT * FROM data_cleaning_trip_seconds_iqr
-- SELECT COUNT(*) FROM data_cleaned_from_duration_outliers -- 179,716,634
-- SELECT * FROM data_cleaning_duration_outliers_results -- 179,716,634
-- SELECT COUNT(*) FROM data_cleaning_agg -- 89,788 (2012-12-31 to 2023-03-31)
-- SELECT * FROM data_cleaning_trips_taxis_iqr
-- SELECT COUNT(*) FROM clean_data -- 176,677,544
-- SELECT COUNT(*) FROM data_cleaning_agg_clean_data -- 89,441
-- SELECT * FROM data_cleaning_results
-- SELECT * FROM typical_trip_seconds
SELECT * FROM hourly_supply_demand ORDER BY 1
-------------------------------------------------------------------------------------------------------------------------------