-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpredict.py
84 lines (72 loc) · 2.46 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
###############################
## IMPORTS
###############################
import pandas as pd
import numpy as np
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers.embeddings import Embedding
from keras.utils.data_utils import get_file
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Merge, Dropout, concatenate, Dense, BatchNormalization, Lambda, TimeDistributed, Dot, dot
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json
from sklearn.model_selection import train_test_split
from zipfile import ZipFile
from os.path import expanduser, exists
import datetime
import time
import json
import click
@click.group()
def cli():
""" This is a simple cli tool to predict if two quora sentences are duplicate. """
pass
def load_siamese_model():
json_file = open('model.json','r')
loaded_model_json = json_file.read()
json_file.close()
# load model
model = model_from_json(loaded_model_json)
# load weights
model.load_weights('question_pairs_weights_type1_final.h5')
return model
def convert_text_to_index_array(text, dictionary):
words = text_to_word_sequence(text)
wordIndices = []
for word in words:
if word in dictionary:
wordIndices.append(dictionary[word])
else:
print("'%s' not in training corpus; ignoring." %(word))
return wordIndices
@cli.command()
@click.option('--ques1', help='Give the first question.')
@click.option('--ques2', help='Give the second question.')
def find_if_duplicate_questions(ques1, ques2):
"""This prints yes if the two input questions are duplicate, else prints no."""
tokenizer = Tokenizer(num_words=100000)
with open('dictionary.json', 'r') as dictionary_file:
dictionary = json.load(dictionary_file)
MAX_SEQUENCE_LENGTH = 130
q1_word_seq = convert_text_to_index_array(ques1,dictionary)
q1_word_seq = [q1_word_seq]
q2_word_seq = convert_text_to_index_array(ques2,dictionary)
q2_word_seq = [q2_word_seq]
q1_data = pad_sequences(q1_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(q2_word_seq, maxlen=MAX_SEQUENCE_LENGTH)
model = load_siamese_model()
print("model loaded...")
pred = model.predict([q1_data
,q2_data])
print(pred)
if(pred > 0.5):
print("not duplicate.")
else:
print("duplicate.")
if __name__ == "__main__":
cli()