-
Notifications
You must be signed in to change notification settings - Fork 0
/
chatbot.py
163 lines (125 loc) · 4.33 KB
/
chatbot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#Building a chatbot with Deep NLP
#importing the libraries
import numpy as np
import tensorflow as tf
import re
import time
####### Part 1: Data preprocessing #######
# Load the two raw dataset files and split each into a list of lines.
# The context managers close each file handle as soon as it has been read;
# undecodable bytes are dropped (errors='ignore').
with open('movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')
with open('movie_conversations.txt', encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')
# Map each line id (first field of a record) to its text (fifth field).
# Records that do not split into exactly five fields are skipped.
id2line = {}
for line in lines:
    fields = line.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    line_id, utterance = fields[0], fields[4]
    id2line[line_id] = utterance
# Build the list of conversations, each one a list of line ids.
# For every record the last field is taken, its first and last characters are
# dropped, single quotes and spaces are removed, and the rest is split on commas.
# The final element of `conversations` is left out (presumably the empty string
# that follows the file's trailing newline -- confirm against the data file).
conversation_ids = [
    record.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "").split(',')
    for record in conversations[:-1]
]
# Build the parallel question and answer lists.
# Within a conversation every line is paired with the line that follows it:
# the earlier line is the question, the later one is its answer.
questions = []
answers = []
for conversation in conversation_ids:
    for asked_id, replied_id in zip(conversation, conversation[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])
#### Text cleaning
# Ordered (pattern, replacement) pairs; they are applied one after another,
# so the order below matters.
_CONTRACTIONS = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
)

# Punctuation characters removed from the text. The hyphen is escaped so it is
# a literal '-': written unescaped between '+' and '=' it forms a character
# range that also matches every digit. ':' is listed explicitly so it is still
# removed, as it was by that range.
_PUNCTUATION = r'[~!@#$%^&*()_+\-={}|";:?/><.,]'


def clean_text(text):
    """Return a normalised copy of *text*.

    The text is lower-cased, the contractions listed in ``_CONTRACTIONS`` are
    expanded in order, and the punctuation characters in ``_PUNCTUATION`` are
    deleted. Digits and apostrophes that are not part of a listed contraction
    are kept.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)
    return re.sub(_PUNCTUATION, "", text)
# Run every question and every answer through clean_text.
clean_questions = [clean_text(raw_question) for raw_question in questions]
clean_answers = [clean_text(raw_answer) for raw_answer in answers]
# Count how often each word occurs across the cleaned questions and answers.
# One dictionary accumulates both passes: re-creating it before the answers
# pass would throw away every count gathered from the questions.
word2count = {}
for corpus in (clean_questions, clean_answers):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1
# Build two vocabularies (one for questions, one for answers) that map each
# sufficiently frequent word to a unique integer id, numbered from 0 in the
# iteration order of word2count. Words seen fewer than `threshold` times are
# left out of both.
threshold = 20
frequent_words = [word for word, count in word2count.items() if count >= threshold]
questionswords2int = {word: index for index, word in enumerate(frequent_words)}
answerswords2int = {word: index for index, word in enumerate(frequent_words)}
# Number of ids handed out, i.e. the next free id in each vocabulary.
word_number = len(frequent_words)
# Add the four special tokens to both vocabularies.
# Each token takes the next unused id, len(dict), so the ids stay contiguous.
# Using len(dict) + 1 would skip one id and give the last token an id equal to
# len(dict), which is outside the range 0..len(dict)-1.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int)
for token in tokens:
    answerswords2int[token] = len(answerswords2int)
# Inverse lookup for the answers vocabulary: integer id -> word.
answersint2words = dict(zip(answerswords2int.values(), answerswords2int.keys()))
# Append the end-of-string token to every cleaned answer.
# Slice assignment keeps updating the existing list object in place.
clean_answers[:] = [answer + ' <EOS>' for answer in clean_answers]
# Encode every question and answer as a list of integer ids.
# A word that is missing from a vocabulary is replaced by that vocabulary's
# <OUT> id.
question_out_id = questionswords2int['<OUT>']
questions_into_int = [
    [questionswords2int.get(word, question_out_id) for word in question.split()]
    for question in clean_questions
]
answer_out_id = answerswords2int['<OUT>']
answers_into_int = [
    [answerswords2int.get(word, answer_out_id) for word in answer.split()]
    for answer in clean_answers
]
# Order the question/answer pairs by question length, keeping only questions
# of 1 to 25 ids; pairs with an empty or longer question are dropped.
# sorted() is stable, so pairs whose questions have the same length keep their
# original relative order.
kept_indices = sorted(
    (index for index, encoded in enumerate(questions_into_int) if 1 <= len(encoded) <= 25),
    key=lambda index: len(questions_into_int[index]),
)
sorted_clean_questions = [questions_into_int[index] for index in kept_indices]
sorted_clean_answers = [answers_into_int[index] for index in kept_indices]