#Dataset Description :
""" ./Datasets/Hate_Counter_Dataset.csv --- contains the Tweet IDs of Hate users mapped with corresponding Counter users. There are in total 1290 pairs of tweets by hate and counter users.
./Models/Catboost-model-tfidf.joblib --- is the catboost model which is trained on only the tfidf vector. This model can be used for classification in case we don't have any information regarding the user, only his tweets are available.
./Models/Catboost-model.joblib --- is our best performing model which is trained on all the features.
./Models/char_vocab.pkl --- contains character level vocabulary trained on 6 million tweets.
./Models/word_vocab.pkl --- contains word level vocabulary trained on 6 million tweets.
./Models/Reproducing_Results.ipynb --- contains the code used to reproduce our results.
"""
# Code Reproduction: make sure you have all dependencies installed before running the models.
import pickle

import numpy as np
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone joblib package
import preprocessor as prep  # tweet-preprocessor: Twitter text cleaning / tokenizing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from tqdm import tqdm  # progress bar used while preprocessing tweets below
#We have got the best accuracy score on CatBoostClassifier. #You can always reproduce the results by importing the model by following commands below:
#**********************************************************************************************************************************
#Model-I """ Description : In this model we only used Tfidf Vectors (Generated from both Word and Character Vocabulary) as our features
"""
#Feature Preparation:
""" char_vocab.pkl - contains character level vocabulary trained on 6 million tweets. word_vocab.pkl - contains word level vocabulary trained on 6 million tweets.
"""
# ----------------------------------------------------------------------------
# Model I: CatBoost trained on TF-IDF vectors only (word + char vocabularies).
# Use this model when no user metadata is available, only the tweets themselves.
cbc = joblib.load('Refined_Project_Dataset/Catboost-model-tfidf.joblib')

# x_test : list containing all tweets of users
# y_test : binary class labels, 1 = Hate | 0 = Counter
with open('Refined_Project_Dataset/xw_test_tfidf.pkl', 'rb') as f:
    x_test = pickle.load(f)
with open('Refined_Project_Dataset/yw_test_tfidf.pkl', 'rb') as f:
    y_test = pickle.load(f)

# Clean each tweet individually. (Bug fix: the original passed the whole
# x_test list to prep.tokenize instead of the current tweet.)
prep_tweets = [prep.tokenize(tweet) for tweet in tqdm(x_test)]

# Pretrained vocabularies built from 6 million tweets (word- and char-level).
with open('word_vocab.pkl', 'rb') as f:
    word_vectorizer = TfidfVectorizer(vocabulary=pickle.load(f))
with open('char_vocab.pkl', 'rb') as f:
    # NOTE(review): a char-level vocabulary normally requires analyzer='char';
    # confirm against the training notebook before relying on these features.
    char_vectorizer = TfidfVectorizer(vocabulary=pickle.load(f))

# NOTE(review): TfidfVectorizer.transform needs IDF weights learned by a prior
# fit; the original code calls transform directly — verify the intended
# workflow (e.g. fit on the training corpus first) when reproducing results.
word_features = word_vectorizer.transform(prep_tweets)
char_features = char_vectorizer.transform(prep_tweets)

# Column-wise concatenation: word features first, then char features,
# matching the feature order the model was trained on.
features = np.c_[np.asarray(word_features.todense()),
                 np.asarray(char_features.todense())]

# Print the classification report of the model's performance.
# (Bug fix: the original evaluated on the raw tweet list x_test instead of
# the TF-IDF feature matrix just assembled above.)
print(classification_report(y_test, cbc.predict(features)))
print('Accuracy:', cbc.score(features, y_test))
#******************************************************************************************************************************************
""" Description : This is model is where we have got our best accuracy results on testing data
"""
""" Features Description : [Orderly] i)Tfidf : Word Vectors + Char Vectors [Orderly] ii)Lexicon Features [Empath] iii)Sentiment Features [Vader] + TextBlob iv)User History:[Order is maintained] 1)followers_count/tweet 2)favourites_count/tweet 3)friends_count/tweet 4)listed_count/tweet 5)statuses_count/tweet 6)verified
"""
# ----------------------------------------------------------------------------
# Model II: our best-performing CatBoost model, trained on ALL features
# (TF-IDF word+char vectors, Empath lexicon, Vader/TextBlob sentiment,
# and per-tweet user-history statistics).
cbc = joblib.load('Refined_Project_Dataset/Catboost-model.joblib')

# x_test : precomputed full feature matrix for the test set — presumably in
#          the feature order documented above; verify against the notebook.
# y_test : binary class labels, 1 = Hate | 0 = Counter
with open('Refined_Project_Dataset/xw_test.pkl', 'rb') as f:
    x_test = pickle.load(f)
with open('Refined_Project_Dataset/yw_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

# Print the classification report of the model's performance.
print(classification_report(y_test, cbc.predict(x_test)))
print('Accuracy:', cbc.score(x_test, y_test))