"""
The purpose of this module is to:
* generate datasets that can be used for training and testing the model
Note: Splitting into train and test sets before feature engineering is essential because it reduces bias in the features: the training set then only contains characteristics of the data subset selected for training. We aim to make the model more generalizable with this.
Prerequisites:
* The following datasets should be downloaded/generated and named:
* download: baskets.parquet, coupons.parquet
* create: df_negative_samples.parquet, product_categories.csv, avg_no_weeks_between_two_purchases.parquet, lags.parquet, purchase_temporal_distribution.parquet
* Table of Contents:
Train+Test
* Load Data Sets
* Merge Data Sets
* Feature Engineering Part I
* Unit Test Block I
* Train-Test-Split
* Clear Memory
Train
* Feature Engineering Part II.a
* Clear Memory
* Feature Engineering Part III.a
* Imputing/Fixing Missing Values
* Unit Test Block II.a
* Store data_train
Test
* Feature Engineering Part II.b
* Clear Memory
* Feature Engineering Part III.b
* Imputing/Fixing Missing Values
* Unit Test Block II.b
* Store data_test
Train+Test
* Unit Test Block III
"""
import pandas as pd
import numpy as np
def generate_dataset(path, train_start, train_end, test_start, test_end):
"""
input:
path: path where the data sets are stored -> the generated data sets are saved as train_s2000_final.parquet and test_s2000_final.parquet to the same path
train_start: first week the training set should start with
train_end: last week the training set should end with
test_start: analogous to train_start
test_end: analogous to train_end
output:
data_train: engineered training set
data_test: engineered testing set
"""
print('The dataframes should be named: \nbaskets.parquet, \ncoupons.parquet, \ndf_negative_samples.parquet, \nproduct_categories.csv, \navg_no_weeks_between_two_purchases.parquet, \nlags.parquet and \npurchase_temporal_distribution.parquet')
'Load Data Sets'
#customers' past purchases (week 0-89, shopper, product, price in € cents)
basket_df = pd.read_parquet(path + '/baskets.parquet')
#coupons customers received in the past (week, shopper, product, discount in %)
coupon_df = pd.read_parquet(path + '/coupons.parquet')
#negative samples generated randomly from the customer preferences, matching the basket length of the corresponding week j for shopper i
negative_sample_df = pd.read_parquet(path + '/df_negative_samples.parquet')
#categories among products that were created with t-SNE
categories = pd.read_csv(path + '/product_categories.csv', sep = ',')
#average number of weeks that passed between two purchases
avg_no_weeks_between_two_purchases = pd.read_parquet(path + '/avg_no_weeks_between_two_purchases.parquet')
#weeks that have passed since shopper i last bought product j
lags = pd.read_parquet(path + '/lags.parquet')
lags = lags.drop('product_bought', axis=1)
lags = lags[lags['week'] <= 89]
#distribution of purchases over time, e.g. whether they occur frequently or rather in the first/last half of the time series
purchase_temporal_distribution = pd.read_parquet(path + '/purchase_temporal_distribution.parquet')
#shrink the dataframes to the 2000 shoppers that we need to make predictions for
basket_df = basket_df[(basket_df['shopper'] < 2000)]
coupon_df = coupon_df[(coupon_df['shopper'] < 2000)]
'Merge Data Sets'
data = pd.merge(basket_df, coupon_df, on=['week', 'shopper', 'product'], how='outer')
data = pd.merge(data, negative_sample_df, on=['week', 'shopper', 'product'], how='outer')
data = pd.merge(data, categories, on=['product'], how='left')
data = pd.merge(data, avg_no_weeks_between_two_purchases, on=['shopper', 'product'], how='left')
data = pd.merge(data, lags, on=['shopper', 'product', 'week'], how='left')
data = pd.merge(data, purchase_temporal_distribution, on=['shopper', 'product'], how='left')
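#note: the outer merges keep rows that exist in only one source (e.g. a coupon that was not redeemed or a negative sample without a purchase); the subsequent left merges only attach additional attributes and add no rows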
'Feature Engineering Part I'
#missing values are imputed with -1 (= never bought) so that missingness can be distinguished from pure absence (0)
data['avg_no_weeks_between_two_purchases'] = data['avg_no_weeks_between_two_purchases'].replace(np.nan, -1)
data['purchase_temporal_distribution'] = data['purchase_temporal_distribution'].replace(np.nan, -1)
data['lag_weeks_of_product_per_customer'] = data['lag_weeks_of_product_per_customer'].replace(np.nan, -1)
#replace NaN value of the col discount with 0 aka no discount
data['discount'] = data['discount'].replace(np.nan, 0)
#for now, replace missing prices with 0; later we adjust them using the max price and the corresponding discount
data['price'] = data['price'].replace(np.nan, 0)
#discount offered to the shopper
data['discount_offered'] = np.where(data['discount'] != 0, 1, 0)
#product purchased
data['product_bought'] = np.where(data['price'] != 0, 1, 0)
#purchase without a discount being offered
data['purchase_w/o_dis'] = np.where(((data['product_bought'] == 1) & (data['discount_offered'] == 0)), 1, 0)
#no purchase even though a discount was offered
data['no_purchase_w_dis'] = np.where(((data['product_bought'] == 0) & (data['discount_offered'] == 1)), 1, 0)
#discount effect --> either neutral/negative (if the shopper would have bought the item anyway, the market possibly lost revenue) or positive
data['discount_effect'] = np.where(((data.discount_offered == 1) & (data.product_bought == 1)), 1, 0)
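#e.g. a row with price=150 and discount=20 gives product_bought=1, discount_offered=1 and therefore discount_effect=1; with price=150 and discount=0 it gives purchase_w/o_dis=1 instead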
'Unit Test Block I'
assert data['shopper'].nunique() == 2000
assert data['discount_offered'].nunique() == 2
assert data['product_bought'].nunique() == 2
assert data['purchase_w/o_dis'].nunique() == 2
assert data['no_purchase_w_dis'].nunique() == 2
assert data['discount_effect'].nunique() == 2
assert data.isna().sum().sum() == 0
'Train-Test-Split'
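#note: both week bounds are inclusive, e.g. train_start=0 and train_end=49 keep weeks 0 through 49 in the training set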
data_train = data[((data['week'] >= train_start) & (data['week'] <= train_end))]
data_test = data[((data['week'] >= test_start) & (data['week'] <= test_end))]
'Clear Memory'
del coupon_df
del negative_sample_df
del categories
del data
'Feature Engineering Part II.a'
#maximal price of product
max_price = data_train.groupby('product')['price'].agg('max').reset_index()
max_price = max_price.rename(columns = {'price': 'max_price'})
#merge max price to the df
data_train = pd.merge(data_train, max_price, on='product', how='left')
#impute missing prices with the max price minus the offered discount (because this was the price the shopper was offered);
#for the missing prices of the negative sample df this automatically inserts the max_price since the discount is 0
data_train['price'] = np.where(data_train['price'] == 0, data_train['max_price'] * (1 - data_train['discount'] / 100), data_train['price'])
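#e.g. max_price=200 (cents) with discount=25 (%) yields an imputed price of 200 * (1 - 25/100) = 150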
#minimal price of product; we need to take the minimal price over the bought products, i.e. from basket_df; otherwise min_price would also be 0 since we imputed the missing prices with 0 before
min_price = basket_df.groupby('product')['price'].agg('min').reset_index()
min_price = min_price.rename(columns={'price': 'min_price'})
#merge min price to the df
data_train = pd.merge(data_train, min_price, on='product', how='left')
'Clear Memory'
del max_price
del min_price
'Feature Engineering Part III.a'
#CUSTOMER DIMENSION
#no_products_bought: number of products bought by customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['product'].agg('count').reset_index().rename(columns={'product': 'no_products_bought'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
#spend_per_customer: customer lifetime value (sum in € spent by customer i)
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['price'].agg('sum').reset_index().rename(columns={'price': 'spend_per_customer'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
#no_unique_products: number of unique products bought by customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['product'].agg('nunique').reset_index().rename(columns={'product': 'no_unique_products'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
#discount_purchase: number of products bought at a discount by customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'discount_purchase'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
#--------------------------
#PRODUCT DIMENSION
#product_sells: number of times the product was sold
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['product'])['price'].agg('count').reset_index().rename(columns={'price': 'product_sells'})
data_train = pd.merge(data_train, tmp, on='product', how='left')
#product_dis_sells: number of times a product was bought with a discount
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['product'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'product_dis_sells'})
data_train = pd.merge(data_train, tmp, on='product', how='left')
#product_dis_sells_share: share of product j's sales that happened at a discount
data_train['product_dis_sells_share'] = data_train['product_dis_sells'] / data_train['product_sells']
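#e.g. a product that was sold 100 times, 40 of them at a discount, gets product_dis_sells_share = 40 / 100 = 0.4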
#--------------------------
#CUSTOMER X PRODUCT DIMENSION
#no_products_bought_per_product: number of purchases of product j by customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper', 'product'])['price'].agg('count').reset_index().rename(columns={'price': 'no_products_bought_per_product'})
data_train = pd.merge(data_train, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_purchases: number of purchases of product j at a discount by customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper', 'product'])['discount_effect'].agg('sum').reset_index().rename(columns={'discount_effect': 'customer_prod_dis_purchases'})
data_train = pd.merge(data_train, tmp, on=['shopper', 'product'], how='left')
#customer_prod_bought_dis_share: share of customer i's purchases of product j that happened at a discount
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper', 'product'])['discount_effect'].agg('mean').reset_index().rename(columns={'discount_effect': 'customer_prod_bought_dis_share'})
data_train = pd.merge(data_train, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_offers: number of discount offers of product j for customer i
tmp = data_train.groupby(['shopper', 'product'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'customer_prod_dis_offers'})
data_train = pd.merge(data_train, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_offered_share: share of redeemed coupons for product j per customer i
tmp = data_train[(data_train['discount_offered'] == 1)].groupby(['shopper', 'product'])['product_bought'].agg('mean').reset_index().rename(columns={'product_bought': 'customer_prod_dis_offered_share'})
data_train = pd.merge(data_train, tmp, on=['shopper', 'product'], how='left')
#customer_product_share: share of product j among all products bought by customer i
data_train['customer_product_share'] = data_train['no_products_bought_per_product'] / data_train['no_products_bought']
#customer_mean_product_price: average price of an item bought by customer i
data_train['customer_mean_product_price'] = data_train['spend_per_customer'] / data_train['no_products_bought']
#customer_discount_buy_share: share of customer i's purchases made at a discount
data_train['customer_discount_buy_share'] = data_train['discount_purchase'] / data_train['no_products_bought']
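#e.g. a shopper with 50 purchases, 5 of them at a discount, gets customer_discount_buy_share = 5 / 50 = 0.1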
#--------------------------
#WEEK X CUSTOMER DIMENSION
#week_basket_size: number of products bought by customer i in week t
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['week', 'shopper'])['product'].agg('count').reset_index().rename(columns={'product': 'week_basket_size'})
data_train = pd.merge(data_train, tmp, on=['week', 'shopper'], how='left')
#week_basket_value: sum in € of the products bought by customer i in week t
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['week', 'shopper'])['price'].agg('sum').reset_index().rename(columns={'price': 'week_basket_value'})
data_train = pd.merge(data_train, tmp, on=['week', 'shopper'], how='left')
#--------------------------
#CUSTOMER DIMENSION
#mean_basket_size: the average basket size of customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['week_basket_size'].agg('mean').reset_index().rename(columns={'week_basket_size': 'mean_basket_size'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
#mean_basket_value: the average basket value in € of customer i
tmp = data_train[(data_train['product_bought'] == 1)].groupby(['shopper'])['week_basket_value'].agg('mean').reset_index().rename(columns={'week_basket_value': 'mean_basket_value'})
data_train = pd.merge(data_train, tmp, on='shopper', how='left')
'Imputing/Fixing Missing Values'
#30 shoppers appear for the first time in week 1; therefore there is no data for them in week 0 and those rows are dropped
data_train = data_train[data_train['week_basket_size'].notna()]
#some shoppers never bought a given product, not even when it was e.g. discounted; this is valuable information, therefore the NaNs are set to -1
data_train['customer_product_share'] = data_train['customer_product_share'].replace(np.nan, -1)
data_train['no_products_bought_per_product'] = data_train['no_products_bought_per_product'].replace(np.nan, -1)
data_train['customer_prod_dis_purchases'] = data_train['customer_prod_dis_purchases'].replace(np.nan, -1)
data_train['customer_prod_bought_dis_share'] = data_train['customer_prod_bought_dis_share'].replace(np.nan, -1)
data_train['customer_prod_dis_offered_share'] = data_train['customer_prod_dis_offered_share'].replace(np.nan, -1)
'Unit Test Block II.a'
assert len(list(data_train.columns)) == 35
assert data_train.isna().sum().sum() == 0
assert data_train['product_bought'].nunique() == 2
'Store data_train'
data_train.sort_values(by=['week', 'shopper', 'product'], inplace=True)
data_train.to_parquet(path + '/train_s2000_final.parquet')
'Feature Engineering Part II.b'
#maximal price of product
max_price = data_test.groupby('product')['price'].agg('max').reset_index()
max_price = max_price.rename(columns = {'price': 'max_price'})
#merge max price to the df
data_test = pd.merge(data_test, max_price, on='product', how='left')
#impute missing prices with the max price minus the offered discount (because this was the price the shopper was offered);
#for the missing prices of the negative sample df this automatically inserts the max_price since the discount is 0
data_test['price'] = np.where(data_test['price'] == 0, data_test['max_price'] * (1 - data_test['discount'] / 100), data_test['price'])
#minimal price of product; we need to take the minimal price over the bought products, i.e. from basket_df; otherwise min_price would also be 0 since we imputed the missing prices with 0 before
min_price = basket_df.groupby('product')['price'].agg('min').reset_index()
min_price = min_price.rename(columns={'price': 'min_price'})
#merge min price to the df
data_test = pd.merge(data_test, min_price, on='product', how='left')
'Clear Memory'
del basket_df
del max_price
del min_price
'Feature Engineering Part III.b'
#CUSTOMER DIMENSION
#no_products_bought: number of products bought by customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['product'].agg('count').reset_index().rename(columns={'product': 'no_products_bought'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
#spend_per_customer: customer lifetime value (sum in € spent by customer i)
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['price'].agg('sum').reset_index().rename(columns={'price': 'spend_per_customer'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
#no_unique_products: number of unique products bought by customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['product'].agg('nunique').reset_index().rename(columns={'product': 'no_unique_products'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
#discount_purchase: number of products bought at a discount by customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'discount_purchase'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
#--------------------------
#PRODUCT DIMENSION
#product_sells: number of times the product was sold
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['product'])['price'].agg('count').reset_index().rename(columns={'price': 'product_sells'})
data_test = pd.merge(data_test, tmp, on='product', how='left')
#product_dis_sells: number of times a product was bought with a discount
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['product'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'product_dis_sells'})
data_test = pd.merge(data_test, tmp, on='product', how='left')
#product_dis_sells_share: share of product j's sales that happened at a discount
data_test['product_dis_sells_share'] = data_test['product_dis_sells'] / data_test['product_sells']
#--------------------------
#CUSTOMER X PRODUCT DIMENSION
#no_products_bought_per_product: number of purchases of product j by customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper', 'product'])['price'].agg('count').reset_index().rename(columns={'price': 'no_products_bought_per_product'})
data_test = pd.merge(data_test, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_purchases: number of purchases of product j at a discount by customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper', 'product'])['discount_effect'].agg('sum').reset_index().rename(columns={'discount_effect': 'customer_prod_dis_purchases'})
data_test = pd.merge(data_test, tmp, on=['shopper', 'product'], how='left')
#customer_prod_bought_dis_share: share of customer i's purchases of product j that happened at a discount
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper', 'product'])['discount_effect'].agg('mean').reset_index().rename(columns={'discount_effect': 'customer_prod_bought_dis_share'})
data_test = pd.merge(data_test, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_offers: number of discount offers of product j for customer i
tmp = data_test.groupby(['shopper', 'product'])['discount_offered'].agg('sum').reset_index().rename(columns={'discount_offered': 'customer_prod_dis_offers'})
data_test = pd.merge(data_test, tmp, on=['shopper', 'product'], how='left')
#customer_prod_dis_offered_share: share of redeemed coupons for product j per customer i
tmp = data_test[(data_test['discount_offered'] == 1)].groupby(['shopper', 'product'])['product_bought'].agg('mean').reset_index().rename(columns={'product_bought': 'customer_prod_dis_offered_share'})
data_test = pd.merge(data_test, tmp, on=['shopper', 'product'], how='left')
#customer_product_share: share of product j among all products bought by customer i
data_test['customer_product_share'] = data_test['no_products_bought_per_product'] / data_test['no_products_bought']
#customer_mean_product_price: average price of an item bought by customer i
data_test['customer_mean_product_price'] = data_test['spend_per_customer'] / data_test['no_products_bought']
#customer_discount_buy_share: share of customer i's purchases made at a discount
data_test['customer_discount_buy_share'] = data_test['discount_purchase'] / data_test['no_products_bought']
#--------------------------
#WEEK X CUSTOMER DIMENSION
#week_basket_size: number of products bought by customer i in week t
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['week', 'shopper'])['product'].agg('count').reset_index().rename(columns={'product': 'week_basket_size'})
data_test = pd.merge(data_test, tmp, on=['week', 'shopper'], how='left')
#week_basket_value: sum in € of the products bought by customer i in week t
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['week', 'shopper'])['price'].agg('sum').reset_index().rename(columns={'price': 'week_basket_value'})
data_test = pd.merge(data_test, tmp, on=['week', 'shopper'], how='left')
#--------------------------
#CUSTOMER DIMENSION
#mean_basket_size: the average basket size of customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['week_basket_size'].agg('mean').reset_index().rename(columns={'week_basket_size': 'mean_basket_size'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
#mean_basket_value: the average basket value in € of customer i
tmp = data_test[(data_test['product_bought'] == 1)].groupby(['shopper'])['week_basket_value'].agg('mean').reset_index().rename(columns={'week_basket_value': 'mean_basket_value'})
data_test = pd.merge(data_test, tmp, on='shopper', how='left')
'Imputing/Fixing Missing Values'
#30 shoppers appear for the first time in week 1; therefore there is no data for them in week 0 and those rows are dropped
data_test = data_test[data_test['week_basket_size'].notna()]
#some shoppers never bought a given product, not even when it was e.g. discounted; this is valuable information, therefore the NaNs are set to -1
data_test['customer_product_share'] = data_test['customer_product_share'].replace(np.nan, -1)
data_test['no_products_bought_per_product'] = data_test['no_products_bought_per_product'].replace(np.nan, -1)
data_test['customer_prod_dis_purchases'] = data_test['customer_prod_dis_purchases'].replace(np.nan, -1)
data_test['customer_prod_bought_dis_share'] = data_test['customer_prod_bought_dis_share'].replace(np.nan, -1)
data_test['customer_prod_dis_offered_share'] = data_test['customer_prod_dis_offered_share'].replace(np.nan, -1)
'Unit Test Block II.b'
assert len(list(data_test.columns)) == 35
assert data_test.isna().sum().sum() == 0
assert data_test['product_bought'].nunique() == 2
'Store data_test'
data_test.sort_values(by=['week', 'shopper', 'product'], inplace=True)
data_test.to_parquet(path + '/test_s2000_final.parquet')
'Unit Test Block III'
assert min(data_train['week']) == train_start
assert max(data_train['week']) == train_end
assert min(data_test['week']) == test_start
assert max(data_test['week']) == test_end
print('\nData sets (train and test) are generated and saved as parquet files to: ' + path)
data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)
return data_train, data_test
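if __name__ == '__main__':
    #minimal usage sketch: the path '.' and the week ranges below are assumptions;
    #adjust them to the local data layout and the desired train/test split
    data_train, data_test = generate_dataset(path='.', train_start=0, train_end=88, test_start=89, test_end=89)
    print(data_train.shape, data_test.shape)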