"""
A model for predicting whether or not a spell in the Harry Potter book series was used in the seventh book.
This project was inspired by Code2040's 2017 Fellow application. The datasets on spells was provided by Code2040 (in two forms: spells & mentions) and modified by me to fit the training method used (Linear Regression).
The datasets were preprocessed in Ruby (I have reasons why. The code will be transformed to Python in the future.) The datasets has the following features:
Classification -> [charm, curse, jinx, spell]: Classification type of each spell.
Consequence -> int: Total influence of a spell, measured by the difference between the sentiment scores of mentions involving the spell i.e. score of mention w/o spell and w/ spell (the Effect is inserted instead of the spell name)
Sentiment -> int: Sentiment score of the effect of each spell.
Count -> int: Number of times each spell was used in books 1 - 6.
Appearance -> int: whether or not a spell was used in book 7.
Olu Gbadebo
Oct. 1, 2017
"""
import argparse
import sys
import tempfile

import pandas as pd
import tensorflow as tf
from tensorflow.python import debug as tf_debug  # for the optional CLI debug hook below
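
# Illustrative sketch (not used by the pipeline below): how the "Consequence"
# feature described in the module docstring could be derived. The actual
# preprocessing was done in Ruby; `sentiment_score` here stands for a
# hypothetical text-sentiment helper and is not part of this project.
def consequence_score(mention, spell_name, effect, sentiment_score):
    """Sentiment of the mention with the spell's effect substituted for its
    name, minus the sentiment of the original mention."""
    with_effect = mention.replace(spell_name, effect)
    return sentiment_score(with_effect) - sentiment_score(mention)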
# Pre-split train/test sets, preprocessed in Ruby (see module docstring).
training_data = pd.read_json('training_data.json', orient='records')
testing_data = pd.read_json('testing_data.json', orient='records')

# Label column: 1 if the spell appears in book 7, 0 otherwise.
# (input_fn below re-derives labels from each DataFrame; these are kept
# for quick inspection.)
train_labels = training_data["Appearance"]
test_labels = testing_data["Appearance"]
# Feature columns
classification = tf.feature_column.categorical_column_with_vocabulary_list(
    "Classification", ["Charm", "Curse", "Jinx", "Spell"])
spell = tf.feature_column.categorical_column_with_hash_bucket(
    "Spell", hash_bucket_size=100)
sentiment = tf.feature_column.numeric_column("Sentiment")
sentiment_buckets = tf.feature_column.bucketized_column(
    sentiment, boundaries=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
count = tf.feature_column.numeric_column("Count")
consequence = tf.feature_column.numeric_column("Consequence")
# Kernel mappers: random Fourier features approximating an RBF kernel.
# TODO: calc stddev for each feature
classification_km = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
    input_dim=4, output_dim=100, stddev=5.0, name='classification_rffm')
spell_km = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
    input_dim=92, output_dim=1000, stddev=5.0, name='spell_rffm')
kernel_mappers = {
    classification: [classification_km],
    spell: [spell_km],
}
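# NOTE: `kernel_mappers` is not passed to the estimator in build_estimator()
# below, so the model currently trains as a plain linear classifier. The
# random Fourier feature mappers also expect dense real-valued inputs, so the
# categorical columns above would need a dense encoding before the mappers
# could actually be applied.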
# Sparse columns (legacy tf.contrib.layers API) to compensate for
# insufficient data.
sparse_classification = tf.contrib.layers.sparse_column_with_hash_bucket(
    "Classification", 4)
sparse_spell = tf.contrib.layers.sparse_column_with_hash_bucket("Spell", 100)
sparse_count = tf.contrib.layers.sparse_column_with_integerized_feature(
    "Count", 1000)

base_columns = [
    sparse_classification, sparse_spell, sentiment_buckets, sparse_count,
    consequence,
]
crossed_columns = [
    tf.feature_column.crossed_column(
        ["Sentiment", "Consequence"],
        hash_bucket_size=1000)
]
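# Crossing Sentiment with Consequence lets the linear model learn a weight
# per (Sentiment, Consequence) combination instead of treating the two
# scores independently.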
def input_fn(data, num_epochs, shuffle):
    """Input builder function: returns a pandas-backed input_fn over `data`."""
    data = data.dropna(how="any", axis=0)  # dropna is not in-place; rebind the cleaned frame
    labels = data["Appearance"]
    return tf.estimator.inputs.pandas_input_fn(
        x=data,
        y=labels,
        batch_size=1,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=1)
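
# Note that input_fn(...) itself returns the callable the estimator invokes,
# which is why the calls below pass input_fn(data, ...) rather than a lambda.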
# Uncomment the next line to enable the TensorFlow CLI debugger.
# hooks = [tf_debug.LocalCLIDebugHook()]
def build_estimator(model_dir):
    """Build an estimator."""
    return tf.contrib.kernel_methods.KernelLinearClassifier(
        feature_columns=base_columns + crossed_columns,
        model_dir=model_dir,
        optimizer=tf.train.FtrlOptimizer(
            learning_rate=0.0001,
            l2_regularization_strength=0.0001)
    )
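
# Sketch (untested): to apply the random Fourier feature mappers defined
# above, the contrib API takes a `kernel_mappers` dict at construction, e.g.
#
#   tf.contrib.kernel_methods.KernelLinearClassifier(
#       feature_columns=base_columns + crossed_columns,
#       model_dir=model_dir,
#       kernel_mappers=kernel_mappers)
#
# subject to the dense-input caveat noted above.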
def train_and_eval(model_dir, train_steps):
    """Train and evaluate the model."""
    model_dir = tempfile.mkdtemp() if not model_dir else model_dir
    m = build_estimator(model_dir)
    # To debug training, pass the hooks above via fit()'s `monitors` argument.
    m.fit(
        input_fn=input_fn(training_data, num_epochs=None, shuffle=True),
        steps=train_steps)
    # evaluate() returns a dict of metrics (e.g. accuracy, loss); print it
    # rather than discarding the result.
    results = m.evaluate(
        input_fn=input_fn(testing_data, num_epochs=1, shuffle=False),
        steps=None)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))
    prediction = m.predict_classes(
        input_fn=input_fn(testing_data, num_epochs=1, shuffle=False))
    print(list(prediction))
FLAGS = None
def main(_):
    train_and_eval(FLAGS.model_dir, FLAGS.train_steps)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        type=str,
        default="",
        help="Base directory for output models."
    )
    parser.add_argument(
        "--train_steps",
        type=int,
        default=2000,
        help="Number of training steps."
    )
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
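
# Example invocation (assuming training_data.json / testing_data.json are in
# the working directory):
#   python index.py --model_dir=/tmp/spell_model --train_steps=2000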