# chatbot.py

#Building a chatbot with Deep NLP


#importing the libraries
import numpy as np
import tensorflow as tf
import re
import time


#######Part 1 Data Preprocessing #######


##### Import dataset #####
# Both corpus files are read in full and split into one string per line.
# `errors='ignore'` drops bytes that are not valid UTF-8 instead of raising.
# The `with` blocks close each file handle as soon as it has been read.
with open('movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')
with open('movie_conversations.txt', encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

##### Creating a dictionary that maps each line id to its text
# Each record is split on ' +++$+++ '; only records with exactly five fields
# are kept. The first field is used as the key and the fifth as the value.
id2line = {}
for raw_line in lines:
    fields = raw_line.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    line_id, line_text = fields[0], fields[4]
    id2line[line_id] = line_text

######## Creating the list of all conversations
# The last field of each record is a bracketed, quoted, comma-separated list
# of line ids; brackets, quotes and spaces are stripped before splitting.
# The final element of `conversations` is skipped.
conversation_ids = []
for raw_conversation in conversations[:-1]:
    id_list_field = raw_conversation.split(' +++$+++ ')[-1]
    bare_ids = id_list_field[1:-1].replace("'", "").replace(" ", "")
    conversation_ids.append(bare_ids.split(','))
    
        
### Getting the question and answer lists separately
# Every pair of consecutive line ids in a conversation yields one
# (question, answer) sample: the earlier line is the question and the line
# that follows it is the answer.
questions = []
answers = []
for line_ids in conversation_ids:
    for question_id, answer_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[question_id])
        answers.append(id2line[answer_id])

#### Cleaning the text
# Contraction expansions, applied in this order after lower-casing.
_CONTRACTION_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
    )
)

# Punctuation to strip. The hyphen is escaped so it is a literal '-': written
# unescaped between '+' and '=' it forms a character range that also matches
# every digit. ':' is listed explicitly because that range used to cover it.
_PUNCTUATION_RE = re.compile(r"[~!@#$%^&*()_+\-={}|\";:?/><.,]")


def clean_text(text: str) -> str:
    """Return *text* lower-cased, with contractions expanded and punctuation removed.

    Digits, letters, whitespace and apostrophes that are not part of a
    handled contraction are left untouched.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return _PUNCTUATION_RE.sub("", text)

# Cleaning the questions: run every raw question through clean_text
clean_questions = [clean_text(raw_question) for raw_question in questions]


# Cleaning the answers: run every raw answer through clean_text
clean_answers = [clean_text(raw_answer) for raw_answer in answers]
    
    
# Word counts in a single word2count dictionary
#
# The counts are accumulated over the questions first and then the answers,
# in the SAME dictionary. The dictionary must not be re-created between the
# two passes, otherwise the question counts are thrown away before the
# frequency threshold is applied.
word2count = {}
for corpus in (clean_questions, clean_answers):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1

# Creating two dictionaries that map the question words and the answer words
# to unique integers

# For the questions: words whose count reaches the threshold are numbered
# 0, 1, 2, ... in the iteration order of word2count.
threshold = 20
question_vocabulary = [
    candidate for candidate, occurrences in word2count.items()
    if occurrences >= threshold
]
questionswords2int = {
    candidate: position for position, candidate in enumerate(question_vocabulary)
}
word_number = len(question_vocabulary)

# For the answers: same rule as for the questions, numbering restarts at 0
answer_vocabulary = [
    candidate for candidate, occurrences in word2count.items()
    if occurrences >= threshold
]
answerswords2int = {
    candidate: position for position, candidate in enumerate(answer_vocabulary)
}
word_number = len(answer_vocabulary)

# Adding the four special tokens to both dictionaries
#
# Each token takes the next free id, i.e. the current size of the dictionary.
# Using `len(...) + 1` here would skip one id and push the largest id up to
# len(dictionary), which is out of range for any table sized by
# len(dictionary).
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int)

for token in tokens:
    answerswords2int[token] = len(answerswords2int)
        
# Creating the inverse dictionary of answerswords2int (integer id -> word)
answersint2words = {
    word_id: vocabulary_word
    for vocabulary_word, word_id in answerswords2int.items()
}

# Adding the End Of String token to the end of every answer
# (slice assignment keeps the same list object and replaces its contents)
clean_answers[:] = [cleaned + ' <EOS>' for cleaned in clean_answers]
    
# Translating all the questions and answers into integers,
# replacing every word that is not in the vocabulary with <OUT>

question_out_id = questionswords2int['<OUT>']
questions_into_int = [
    [questionswords2int.get(token, question_out_id) for token in sentence.split()]
    for sentence in clean_questions
]

# Same translation for the answers, using the answers vocabulary
answer_out_id = answerswords2int['<OUT>']
answers_into_int = [
    [answerswords2int.get(token, answer_out_id) for token in sentence.split()]
    for sentence in clean_answers
]
# Sorting questions and answers by the length of the question
# Only questions of 1 to 25 tokens are kept; each answer stays paired with
# its question. Within one length, the original order is preserved.
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for position, encoded_question in enumerate(questions_into_int):
        if len(encoded_question) != length:
            continue
        sorted_clean_questions.append(encoded_question)
        sorted_clean_answers.append(answers_into_int[position])