-
Notifications
You must be signed in to change notification settings - Fork 1
/
label_and_onehot_encode.py
201 lines (160 loc) · 8.03 KB
/
label_and_onehot_encode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Import card dataframe.
# The path is assembled with pathlib: a backslash-separated string literal
# contains an invalid escape sequence and only resolves to a file inside the
# "data" directory on Windows.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
card_df = pd.read_pickle(Path("data") / "card_df.pkl")
# Verify length of card dataframe and preview the first rows
print(len(card_df))
print(card_df.head())
# Normalise column types before encoding.
# Numeric columns become ints; a missing retreat cost is replaced by 0 first.
card_df['hp'] = card_df['hp'].astype(int)
card_df['convertedRetreatCost'] = card_df['convertedRetreatCost'].fillna(0).astype(int)
# 'subtypes' is stringified element by element before the generic cast below.
card_df['subtypes'] = list(map(str, card_df['subtypes']))
# Every remaining descriptive column is cast to its string representation.
for text_column in ('name', 'subtypes', 'rules', 'types'):
    card_df[text_column] = card_df[text_column].astype(str)
# Flatten the per-card 'attacks' value into one column per attack slot and
# attribute (attack_<field>_<slot>), then discard the original column.
attack_fields = ('name', 'text', 'damage', 'convertedEnergyCost')
attack_slots = range(1, 5)

# Pre-create the four slots for every attribute, initialised to None.
for field in attack_fields:
    for slot in attack_slots:
        card_df[f'attack_{field}_{slot}'] = None

for idx, card in card_df.iterrows():
    card_attacks = card['attacks']
    if not card_attacks:
        # No attacks on this card: explicitly blank all four slots.
        for slot in attack_slots:
            for field in attack_fields:
                card_df.at[idx, f'attack_{field}_{slot}'] = None
        continue
    # Copy each attack's attributes into the slot matching its position.
    for slot, attack in enumerate(card_attacks, start=1):
        for field in attack_fields:
            card_df.at[idx, f'attack_{field}_{slot}'] = getattr(attack, field)

# Drop the now-redundant attacks column
card_df = card_df.drop(columns='attacks')
# 'weaknesses' is stringified element by element, then both columns are cast
# to their string representation so they can be label encoded.
card_df['weaknesses'] = list(map(str, card_df['weaknesses']))
for text_column in ('weaknesses', 'evolvesFrom'):
    card_df[text_column] = card_df[text_column].astype(str)
# Fitted label encoders, keyed by the name of the encoded output column.
# The two attack encoders are registered first, before the plain columns.
attack_damage_encoder = LabelEncoder()
attack_convertedEnergyCost_encoder = LabelEncoder()
label_encoders = {
    'attack_damage': attack_damage_encoder,
    'attack_convertedEnergyCost': attack_convertedEnergyCost_encoder,
}

# Frame that receives the integer-coded columns
le_card_df = pd.DataFrame()
print(card_df.columns)

# Plain columns: each gets its own encoder, fitted on the string form of the
# column.
# NOTE(review): 'hp' and 'convertedRetreatCost' are cast to str before
# fitting, so their codes are assigned over strings, not numbers -- confirm
# this is intended.
plain_columns = ('name', 'types', 'hp', 'weaknesses', 'convertedRetreatCost', 'evolvesFrom')
for col_name in plain_columns:
    encoder = LabelEncoder()
    le_card_df[col_name] = encoder.fit_transform(card_df[col_name].astype(str))
    label_encoders[col_name] = encoder

# Attack columns: only the first attack slot of each card is encoded.
first_attack_sources = (
    ('attack_damage', 'attack_damage_1'),
    ('attack_convertedEnergyCost', 'attack_convertedEnergyCost_1'),
)
for target_name, source_name in first_attack_sources:
    le_card_df[target_name] = label_encoders[target_name].fit_transform(card_df[source_name])
# Build a lookup table with one row per (column, class) pair.
# Column meaning: 'column_name' is the encoded column, 'encoding' holds the
# original class label and 'decoding' holds the integer code assigned to it.
# The per-encoder frames are collected and concatenated once; concatenating
# inside the loop onto an initially empty frame re-copies the table on every
# iteration and relies on deprecated empty-frame concat behaviour.
decoding_columns = ['column_name', 'encoding', 'decoding']
decoding_frames = []
for column, label_encoder in label_encoders.items():
    classes = label_encoder.classes_
    codes = label_encoder.transform(classes)
    decoding_frames.append(pd.DataFrame({
        'column_name': [column] * len(classes),
        'encoding': list(classes),
        'decoding': list(codes),
    }))

if decoding_frames:
    label_decoding_df = pd.concat(decoding_frames, ignore_index=True)
else:
    # No encoders registered: keep an empty table with the expected columns.
    label_decoding_df = pd.DataFrame(columns=decoding_columns)
# Fitted one-hot encoders, keyed by column name
onehot_encoders = {}
# Dense one-hot matrices, keyed by column name
oh_card_dict = {}
# Position -> category lookup for each one-hot encoded column
oh_decoding_dict = {}

# One-hot encode the list-like descriptive columns via their string form.
for column in ['subtypes', 'rules', 'legalities']:
    onehot_encoder = OneHotEncoder()
    encoded_data = onehot_encoder.fit_transform(card_df[column].astype(str).values.reshape(-1, 1))
    oh_card_dict[column] = encoded_data.toarray()
    onehot_encoders[column] = onehot_encoder
    # Column position i of the one-hot matrix corresponds to categories_[0][i]
    oh_decoding_dict[column] = dict(enumerate(onehot_encoder.categories_[0]))

# Register the (still unfitted) encoders used for the first attack's name and
# text.
attack_name_encoder = OneHotEncoder()
attack_text_encoder = OneHotEncoder()
onehot_encoders['attack_name'] = attack_name_encoder
onehot_encoders['attack_text'] = attack_text_encoder

# The attack encoders have no categories until they are fitted, so their
# decoding tables start empty. Filling them here from the loop variable above
# would copy the 'legalities' categories under the attack keys.
oh_decoding_dict['attack_name'] = {}
oh_decoding_dict['attack_text'] = {}
# Prepare the first attack's name and text for one-hot encoding.
# Missing values are replaced by 'NA' on the series being encoded: calling
# card_df.fillna('NA') without using its return value changes nothing, so
# missing attacks would otherwise be encoded as the string 'None'.
first_attack_names = card_df['attack_name_1'].fillna('NA')
first_attack_texts = card_df['attack_text_1'].fillna('NA')
# OneHotEncoder expects a 2-D array: one row per card, a single feature column
names_values = first_attack_names.astype(str).values.reshape(-1, 1)
texts_values = first_attack_texts.astype(str).values.reshape(-1, 1)

# Onehot encode attacks
attack_encoded_name_data = attack_name_encoder.fit_transform(names_values)
attack_encoded_text_data = attack_text_encoder.fit_transform(texts_values)

# Store the dense encoded matrices alongside the other one-hot columns
oh_card_dict['attack_name'] = attack_encoded_name_data.toarray()
oh_card_dict['attack_text'] = attack_encoded_text_data.toarray()

# Convert each matrix to a list of per-card rows so that every dataframe cell
# holds the one-hot vector of a single card
oh_card_dict = {key: value.tolist() for key, value in oh_card_dict.items()}

# Concatenate encoded data into a single dataframe (one column per feature)
oh_card_df = pd.DataFrame.from_dict(oh_card_dict)
# Rebuild the decoding table from the registered encoders: for every one-hot
# encoded column, map each vector position to the category it stands for
# (taken from the encoder's categories_ for its single input feature).
oh_decoding_dict = {
    encoded_column: dict(enumerate(fitted_encoder.categories_[0]))
    for encoded_column, fitted_encoder in onehot_encoders.items()
}
# Persist the encoded frames, the fitted label encoders and the one-hot
# decoding table. Paths are built with pathlib because backslash-separated
# string literals contain invalid escape sequences and only address the
# 'test_data' directory on Windows; the directory is created if it is missing.
output_dir = Path('test_data')
output_dir.mkdir(parents=True, exist_ok=True)
pd.to_pickle(le_card_df, output_dir / 'label_encoded_df.pkl')
pd.to_pickle(oh_card_df, output_dir / 'oh_encoded_df.pkl')
pd.to_pickle(label_encoders, output_dir / 'label_encoders.pkl')
pd.to_pickle(oh_decoding_dict, output_dir / 'oh_decoding_dict.pkl')
# Preview what was written
print(le_card_df.head())
print(oh_card_df.head())
#print(label_decoding_df)
#print(oh_decoding_dict)
# Round-trip check: decode both encodings and compare with the source frame.

# Label encoded columns: map every integer code back to its class label.
for column, label_encoder in label_encoders.items():
    print(column)
    known_labels = label_encoder.classes_
    code_to_label = dict(zip(label_encoder.transform(known_labels), known_labels))
    le_card_df[column] = le_card_df[column].map(code_to_label)

# One-hot encoded columns: replace each vector by the comma-separated names of
# the categories whose position is set to 1.
onehot_columns = ['subtypes', 'rules', 'legalities', 'attack_name', 'attack_text']
for column in onehot_columns:
    index_to_category = oh_decoding_dict[column]
    oh_card_df[column] = [
        ', '.join(
            index_to_category[position]
            for position, flag in enumerate(onehot_row)
            if flag == 1
        )
        for onehot_row in oh_card_df[column]
    ]

# Combine the label decoded and onehot decoded data into a single dataframe
decoded_card_df = le_card_df.join(oh_card_df[onehot_columns])

# Reorder columns to mirror the layout of the source frame
decoded_order = [
    'name', 'subtypes', 'rules', 'types', 'hp', 'weaknesses',
    'convertedRetreatCost', 'legalities', 'evolvesFrom',
    'attack_name', 'attack_text', 'attack_damage', 'attack_convertedEnergyCost',
]
decoded_card_df = decoded_card_df[decoded_order]

# Display the decoded data next to the matching columns of the source frame
source_order = [
    'name', 'subtypes', 'rules', 'types', 'hp', 'weaknesses',
    'convertedRetreatCost', 'legalities', 'evolvesFrom',
    'attack_name_1', 'attack_text_1', 'attack_damage_1', 'attack_convertedEnergyCost_1',
]
print(decoded_card_df.head())
print(card_df[source_order].head())

# Export the decoded frame for manual inspection
decoded_card_df.to_csv("decoded_card_df.csv")