-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextAnalysis.py
114 lines (94 loc) · 4.27 KB
/
TextAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import statistics as st
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.datasets import load_files
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
import os
import time
# Credit: a significant part of this algorithm is adapted from https://stackabuse.com/text-classification-with-python-and-scikit-learn
def _abort_missing_data():
    """Tell the user the corpus folders are unusable and stop the program."""
    print("Please put text in target folders")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the `site` module for interactive sessions and is not
    # guaranteed to exist in every runtime. A non-zero status marks failure.
    raise SystemExit(1)


def _simple_clean(texts):
    """Tokenise each text with gensim's simple_preprocess and re-join the tokens."""
    from gensim.utils import simple_preprocess
    # The tokens are joined back into one string per document because
    # CountVectorizer expects raw text, not token lists.
    return [' '.join(simple_preprocess(text)) for text in texts]


def _lemmatized_clean(texts):
    """Clean each text with regular expressions, lower-case it and lemmatize it."""
    import re
    from nltk.stem import WordNetLemmatizer

    # Patterns are compiled once instead of on every document.
    non_word = re.compile(r'\W')
    inner_single_char = re.compile(r'\s+[a-zA-Z]\s+')
    # Anchored with a real '^'; an escaped '\^' would only match a literal
    # caret, which can no longer occur once non-word characters are removed.
    leading_single_char = re.compile(r'^[a-zA-Z]\s+')
    whitespace_run = re.compile(r'\s+')
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for text in texts:
        text = non_word.sub(' ', text)             # drop special characters
        text = inner_single_char.sub(' ', text)    # drop stand-alone letters
        text = leading_single_char.sub(' ', text)  # drop a stand-alone first letter
        text = whitespace_run.sub(' ', text)       # collapse repeated whitespace
        words = text.lower().split()
        cleaned.append(' '.join(lemmatizer.lemmatize(word) for word in words))
    return cleaned


def Training(FilePath, Method = False):
    """Train and evaluate a Multinomial Naive Bayes text classifier.

    Every sub-folder of ``FilePath`` is loaded as one category (for example
    ``Travel`` and ``NotTravel``). The documents are preprocessed, turned into
    1- to 3-gram count vectors, split 70/30 into train and test sets, and the
    classifier is scored on the test part. The confusion matrix, the
    classification report and the accuracy are printed.

    Args:
        FilePath: Directory containing one sub-folder per category.
        Method: Falsy selects gensim's ``simple_preprocess``; truthy selects
            the regex clean-up followed by WordNet lemmatization.

    Returns:
        The accuracy on the held-out test split. The split is not seeded, so
        repeated calls generally return different values.

    Raises:
        SystemExit: If ``FilePath`` cannot be listed, holds at most one entry,
            or cannot be loaded as a dataset.
    """
    try:
        entries = os.listdir(FilePath)
    except OSError:
        # Missing or unreadable directory: same friendly message as an empty one.
        _abort_missing_data()
    if len(entries) <= 1:
        # At least two category folders are needed to classify anything.
        _abort_missing_data()

    try:
        # Decode while loading so both preprocessors receive real text rather
        # than bytes; str() on bytes would leak escape sequences such as "\n"
        # into the tokens. Undecodable bytes are replaced instead of raising.
        dataset = load_files(FilePath, encoding='utf-8', decode_error='replace')
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt.
        _abort_missing_data()

    categories = dataset.target_names
    print(categories)
    texts, y = dataset.data, dataset.target

    # Any truthy/falsy value picks a branch, so `documents` is never left empty.
    documents = _lemmatized_clean(texts) if Method else _simple_clean(texts)

    vectorizer = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1, 3),
                                 stop_words=stopwords.words('english'))
    # Keep the matrix sparse: MultinomialNB and train_test_split accept sparse
    # input, and a dense 1-3-gram matrix can be very large.
    x = vectorizer.fit_transform(documents)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    clf = MultinomialNB()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Pass every label explicitly so the reports keep one row per category
    # even when a category is absent from the random test split.
    labels = list(range(len(categories)))
    print(confusion_matrix(y_test, y_pred, labels=labels))
    print(classification_report(y_test, y_pred, labels=labels, target_names=categories))
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    return accuracy
def _ask_yes_no(prompt):
    """Ask a Y/N question and return True only for an explicit yes."""
    # A substring test such as `answer in "Yy"` is True for an empty answer,
    # so compare the normalised answer against whole words instead.
    return input(prompt).strip().lower() in ("y", "yes")


def _ask_test_count():
    """Keep asking until the user enters a whole number of at least 1."""
    while True:
        raw = input("Enter number of tests: ")
        try:
            count = int(raw)
        except ValueError:
            print("Please enter a whole number")
            continue
        if count >= 1:
            return count
        print("Please enter a number of at least 1")


def main():
    """Interactively run Training several times and summarise the accuracies."""
    print("Warning: not tested on MacOs or Linux")
    if _ask_yes_no("Do you need to download stopwords? Y/N "):
        nltk.download('stopwords')
    FilePath = input("Please enter filepath: ")
    Met = _ask_yes_no("Do you want to use a complete preprocess rather than a simple preprocess? Y/N ")
    n = _ask_test_count()

    # Start the clock after the prompts so the reported time covers the
    # training runs only, not the time spent waiting for user input.
    StartTime = time.time()

    # A list, not a set: identical accuracies from different runs must all
    # count towards the mean, median and standard deviation.
    results = [Training(FilePath, Met) for _ in range(n)]

    print("Mean of tests: ", st.mean(results))
    print("Median of tests: ", st.median(results))
    if len(results) > 1:
        print("Standard deviation: ", st.stdev(results))
    else:
        # statistics.stdev raises StatisticsError for fewer than two values.
        print("Standard deviation: ", "n/a (needs at least two tests)")
    print("--- %s seconds --- for training the module" % (time.time() - StartTime))


if __name__ == '__main__':
    main()