Skip to content

Commit

Permalink
Use all 3 datasets to build the vocabularies (#15)
Browse files (browse the repository at this point in the history)
Replaced the hard-coded cast to float32 with theano.config.floatX; the hard-coded cast caused reloading of the model to fail
  • Loading branch information
kylase authored Jan 26, 2019
1 parent dca0646 commit c7a2557
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 12 deletions.
2 changes: 2 additions & 0 deletions app/resources/parscit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def post(self):
data = prepare_dataset([[[token] for token in tokens]],
current_app.word_to_id,
current_app.char_to_id,
{},
current_app.model.parameters['lower'],
True)

Expand Down Expand Up @@ -81,6 +82,7 @@ def post(self):
data = prepare_dataset(tokens,
current_app.word_to_id,
current_app.char_to_id,
{},
current_app.model.parameters['lower'],
True)

Expand Down
4 changes: 2 additions & 2 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,8 @@ def build(self,
transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

small = -1000
b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
b_s = np.array([[small] * n_tags + [0, small]]).astype(theano.config.floatX)
e_s = np.array([[small] * n_tags + [small, 0]]).astype(theano.config.floatX)
observations = T.concatenate(
[tags_scores, small * T.ones((s_len, 2))],
axis=1
Expand Down
2 changes: 1 addition & 1 deletion optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6):
lr = theano.shared(np.float32(lr).astype(floatX))

gradients = self.get_gradients(cost, params)
accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params]
accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(floatX)) for p in params]

updates = []

Expand Down
18 changes: 10 additions & 8 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@

# Initialize model
model = Model(parameters=parameters, models_path=models_path)
logging.info("Model location: %s" % model.model_path)
logging.info("Model location: %s", model.model_path)

# Data parameters
lower = parameters['lower']
Expand All @@ -155,22 +155,24 @@
##update_tag_scheme(dev_sentences, tag_scheme)
##update_tag_scheme(test_sentences, tag_scheme)

all_sentences = train_sentences + dev_sentences + test_sentences

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
dico_words_train = word_mapping(train_sentences, lower)[0]
dico_words_train = word_mapping(all_sentences, lower)[0]
dico_words, word_to_id, id_to_word = augment_with_pretrained(
dico_words_train.copy(),
parameters['pre_emb'],
None
)
else:
dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
dico_words, word_to_id, id_to_word = word_mapping(all_sentences, lower)
dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
dico_chars, char_to_id, id_to_char = char_mapping(all_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(all_sentences)

# Index data
train_data = prepare_dataset(
Expand Down Expand Up @@ -229,12 +231,12 @@
logging.info("Score on dev: %.5f", dev_score)
logging.info("Score on test: %.5f", test_score)
if dev_score > best_dev:
logging.info("New best score on dev: %f. (Previously: %f)", dev_score, best_dev)
best_dev = dev_score
logging.info("New best score on dev.")
logging.info("Saving model to disk...")
model.save()
if test_score > best_test:
logging.info("New best score on test: %f. (Previously: %f)", test_score, best_test)
best_test = test_score
logging.info("New best score on test.")
logging.info("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))
logging.info("Epoch %i done. Average cost: %f", epoch, np.mean(epoch_costs))
model.save()
2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def set_values(name, param, pretrained):
)
param.set_value(np.reshape(
pretrained, param_value.shape
).astype(np.float32))
).astype(theano.config.floatX))


def shared(shape, name):
Expand Down

0 comments on commit c7a2557

Please sign in to comment.