-
Notifications
You must be signed in to change notification settings - Fork 1
/
Cleaning and imputing.py
168 lines (112 loc) · 7.1 KB
/
Cleaning and imputing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 11 13:04:25 2023
@author: Yousha
"""
# Source: https://www.kaggle.com/code/mateuszk013/spaceship-titanic-81-eda-ml/notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.dtypes
train_df.columns
train_df.head()
train_df.tail()
train_df.isna().any()
train_df.info()
train_df.describe(include='all')
(train_df.CryoSleep == True).sum()/len(train_df)
train_df_copy = train_df.copy()
train_df_copy['Expenses'] = train_df_copy[['RoomService', 'FoodCourt',
'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
train_df_copy.Age = train_df_copy.Age.fillna(train_df_copy.Age.median())
train_df_copy.loc[train_df_copy.Adult_spending_awake ==False, 'Age'] = 0
train_df_copy['Adult_spending_awake'] = (train_df_copy['Expenses'] > 0) & (train_df_copy['Age'] >=13) & (train_df_copy['CryoSleep'] == False)
train_df_copy['Cryosleep'] = 0
train_df_copy.loc[train_df_copy['Expenses'] == 0, 'Cryosleep'] = 1
train_df_copy.loc[train_df_copy.CryoSleep.astype('str') == 'True', 'Cryosleep'] = 1
train_df_copy.loc[train_df_copy.CryoSleep.astype('str') == 'False', 'Cryosleep'] = 0
train_df_copy['Cryosleep'] = train_df_copy['Cryosleep'].astype('bool')
train_df_copy['CryoSleep'] = train_df_copy['Cryosleep']
train_df_copy.drop('Cryosleep',axis=1,inplace=True)
train_df_copy.drop('Name',axis=1,inplace=True)
train_df_copy.loc[train_df_copy.CryoSleep == True,['RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck']] = 0
train_df_copy.loc[train_df_copy.CryoSleep == True,['RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck']].isna().sum()
train_df_copy['Age'].plot(kind='hist')
train_df_copy['Adults'] = train_df_copy['Age'] >= 13
train_df_copy['Adult_and_spending'] = (train_df_copy['Expenses'] > 0) & (train_df_copy['Age'] >=13)
train_df_copy.loc[train_df_copy.Adult_and_spending == True]
train_df_copy.RoomService = train_df_copy.RoomService.fillna(train_df_copy.RoomService.mean())
train_df_copy.loc[train_df_copy.Adult_and_spending ==False, 'RoomService'] = 0
train_df_copy.FoodCourt = train_df_copy.FoodCourt.fillna(train_df_copy.FoodCourt.mean())
train_df_copy.loc[train_df_copy.Adult_and_spending ==False, 'FoodCourt'] = 0
train_df_copy.ShoppingMall = train_df_copy.ShoppingMall.fillna(train_df_copy.ShoppingMall.mean())
train_df_copy.loc[train_df_copy.Adult_and_spending ==False, 'ShoppingMall'] = 0
train_df_copy.Spa = train_df_copy.Spa.fillna(train_df_copy.Spa.mean())
train_df_copy.loc[train_df_copy.Adult_and_spending ==False, 'Spa'] = 0
train_df_copy.VRDeck = train_df_copy.VRDeck.fillna(train_df_copy.VRDeck.mean())
train_df_copy.loc[train_df_copy.Adult_and_spending ==False, 'VRDeck'] = 0
train_df_copy.HomePlanet = train_df_copy.HomePlanet.fillna('Earth')
train_df_copy.Destination = train_df_copy.Destination.fillna('TRAPPIST-1e')
train_df_copy.VIP = train_df_copy.VIP.fillna('False')
train_df_copy.VIP = train_df_copy.VIP.astype('bool')
train_df_copy['Cabin'] = train_df_copy.Cabin.fillna(method='ffill')
train_df_copy['Group_nums'] = train_df_copy.PassengerId.apply(lambda x: x.split('_')).apply(lambda x: x[0])
train_df_copy['Grouped'] = ((train_df_copy['Group_nums'].value_counts() > 1).reindex(train_df_copy['Group_nums'])).tolist()
train_df_copy['Deck'] = train_df_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[0])
train_df_copy['Side'] = train_df_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[2])
train_df_copy['Has_expenses'] = train_df_copy['Expenses'] > 0
train_df_copy['Is_Embryo'] = train_df_copy['Age'] == 0
plt.bar(train_df_copy['Side'],train_df_copy['Transported'])
plt.bar(train_df_copy['Deck'],train_df_copy['Transported'])
train_df_copy.columns
train_df_copy.drop(['IsEmbryo'],axis=1, inplace=True)
train_df_copy.to_csv('Cleaned and imputed data 2.csv',index=False)
#Test Data
test_df_copy = test_df.copy()
test_df_copy['Expenses'] = test_df_copy[['RoomService', 'FoodCourt',
'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_df_copy.Age = test_df_copy.Age.fillna(test_df_copy.Age.median())
test_df_copy['Adult_spending_awake'] = (test_df_copy['Expenses'] > 0) & (test_df_copy['Age'] >=13) & (test_df_copy['CryoSleep'] == False)
test_df_copy['Cryosleep'] = 0
test_df_copy.loc[test_df_copy['Expenses'] == 0, 'Cryosleep'] = 1
test_df_copy.loc[test_df_copy.CryoSleep.astype('str') == 'True', 'Cryosleep'] = 1
test_df_copy.loc[test_df_copy.CryoSleep.astype('str') == 'False', 'Cryosleep'] = 0
test_df_copy['Cryosleep'] = test_df_copy['Cryosleep'].astype('bool')
test_df_copy['CryoSleep'] = test_df_copy['Cryosleep']
test_df_copy.drop('Cryosleep',axis=1,inplace=True)
test_df_copy.drop('Name',axis=1,inplace=True)
test_df_copy.loc[test_df_copy.CryoSleep == True,['RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck']] = 0
test_df_copy.loc[test_df_copy.CryoSleep == True,['RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck']].isna().sum()
test_df_copy['Age'].plot(kind='hist')
test_df_copy['Adults'] = test_df_copy['Age'] >= 13
test_df_copy['Adult_and_spending'] = (test_df_copy['Expenses'] > 0) & (test_df_copy['Age'] >=13)
test_df_copy.loc[test_df_copy.Adult_and_spending == True]
test_df_copy.RoomService = test_df_copy.RoomService.fillna(test_df_copy.RoomService.mean())
test_df_copy.loc[test_df_copy.Adult_and_spending ==False, 'RoomService'] = 0
test_df_copy.FoodCourt = test_df_copy.FoodCourt.fillna(test_df_copy.FoodCourt.mean())
test_df_copy.loc[test_df_copy.Adult_and_spending ==False, 'FoodCourt'] = 0
test_df_copy.ShoppingMall = test_df_copy.ShoppingMall.fillna(test_df_copy.ShoppingMall.mean())
test_df_copy.loc[test_df_copy.Adult_and_spending ==False, 'ShoppingMall'] = 0
test_df_copy.Spa = test_df_copy.Spa.fillna(test_df_copy.Spa.mean())
test_df_copy.loc[test_df_copy.Adult_and_spending ==False, 'Spa'] = 0
test_df_copy.VRDeck = test_df_copy.VRDeck.fillna(test_df_copy.VRDeck.mean())
test_df_copy.loc[test_df_copy.Adult_and_spending ==False, 'VRDeck'] = 0
test_df_copy.HomePlanet = test_df_copy.HomePlanet.fillna('Earth')
test_df_copy.Destination = test_df_copy.Destination.fillna('TRAPPIST-1e')
test_df_copy.VIP = test_df_copy.VIP.fillna('False')
test_df_copy.VIP = test_df_copy.VIP.astype('bool')
test_df_copy['Cabin'] = test_df_copy.Cabin.fillna(method='ffill')
test_df_copy['Group_nums'] = test_df_copy.PassengerId.apply(lambda x: x.split('_')).apply(lambda x: x[0])
test_df_copy['Grouped'] = ((test_df_copy['Group_nums'].value_counts() > 1).reindex(test_df_copy['Group_nums'])).tolist()
test_df_copy['Deck'] = test_df_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[0])
test_df_copy['Side'] = test_df_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[2])
test_df_copy['Has_expenses'] = test_df_copy['Expenses'] > 0
test_df_copy['Is_Embryo'] = test_df_copy['Age'] == 0
test_df_copy.columns
test_df_copy.drop(['Expenses', 'Adult_spending_awake', 'Adult_and_spending','Adults'],axis=1, inplace=True)
test_df_copy.to_csv('Cleaned and imputed test data.csv',index=False)