-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPandas_MicroCodes.py
103 lines (66 loc) · 3.06 KB
/
Pandas_MicroCodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#Micro codes for checking the Pandas
# Read CSV in chunks
data_read = pd.read_csv('Filename.csv',chunksize=x,encoding = "any encoding format for eg :ISO-8859-1")
data = pd.concat(data_read,ignore_index=True)
# Split the dataframe based on the numeric and categorical values
# Numeric Split
cleansed_data_numeric = data.select_dtypes(include=['number']).columns
data_read_numeric = data.loc[:,lambda data : cleansed_data_numeric]
numeric_null_count = data_read_numeric.apply(lambda x : sum(x.notnull()))
# Categorical Split
cleansed_data_category = data.select_dtypes(exclude=['number']).columns
data_read_category = data.loc[:,lambda data : cleansed_data_category]
categorical_null_count = data_read_category.apply(lambda x : sum(x.notnull()))
# Date Difference and Date Conversion Logic
import datetime as DT
data['Date'].dtype
#pd.to_datetime(data['Date'])
now = pd.Timestamp(DT.datetime.now())
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
# Difference
data['DOB_NEW'] =data['DOB_NEW'].where(data['DOB_NEW'] < now, data['DOB_NEW'] - np.timedelta64(100, 'Y')) # 2
data['Age_Driver1'] = (now - data['DOB_NEW']).astype('<m8[Y]') # 3
# Copying chunks of data from one frame to another
# data=original frame ptsdriver2 = copied frame
ptsdriver2 = data[['ViolPoints2Driver_2',
'ViolPoints1Driver_2',
'ViolPoints3Driver_2',
'ViolPoints4Driver_2',
'ViolPoints5Driver_2',
'ViolPoints6Driver_2',
'ViolPoints7Driver_2',
'ViolPoints8Driver_2',
]].copy()
# Sum of values in the frame row-wise
ptsdriver2['Points_Driver2'] = ptsdriver2.apply(lambda x : x.sum(),axis=1)
# Replace Blank values with NaN
dataframe.replace(r'^\s*$', np.NaN, regex=True, inplace = True)
# Scaling the values to remove if you want to treat numeric values on same scale
# Libraries
from sklearn.preprocessing import StandardScaler
#Step1 : Create ScaleClean using
#Example :
ScaleClean = StandardScaler().fit(data[['Zip']])
#Step2 : Create Clean transform
#Example :
CleanTransform = ScaleClean.transform(data[['Zip']])
#Step3 : Create dataframe of the Clean Transform
#Example : clean = pd.DataFrame(CleanTransform)
#Step4 : Join/Concatenate the Frames
frames = [data,clean]
data = pd.concat(frames,axis=1)
#Step4 : Drop the Original Columns and Rename the New Ones
data = data.drop(['Zip'],1)
data = data.rename(columns={0:'Zip'})
#Copying the data in a different frame based on column value condition
datax = data[data['Gender']=='M']
# Striptime and Date Difference in days
data['DiscoveryDate']= pd.DatetimeIndex(data['Discovery Date']).date
data['OccurrenceStartDate']= pd.DatetimeIndex(data['Occurrence Start Date']).date
data['Occurrence_Discovery_Diff']= (data['OccurrenceStartDate'] - data['DiscoveryDate'])/np.timedelta64(1, 'D')
# Remove negative values from columns
dat_pos1 =data_numeric[(data_numeric[
['Net Loss','Recovery Amount',
'Estimated Gross Loss',
'Recovery Amount (percent)',
'Occurrence_Discovery_Diff']] > 0).all(1)]