#https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html
import gensim
import pandas as pd
import spacy
import re
import string
import json
df_schemes = pd.read_csv('df.csv', encoding='cp1252')  # Needed to map result indices back to scheme metadata
dictionary = gensim.corpora.Dictionary.load('weights_generation/dictionary')  # Needed to map the bag-of-words of a query onto the pre-built dictionary
schemes_tfidf_model = gensim.models.TfidfModel.load("weights_generation/tfidf.model")  # Needed to feed into the LSI model
schemes_lsi_model = gensim.models.LsiModel.load("weights_generation/lsi.model")  # Final model
schemes_lsi_corpus = gensim.corpora.MmCorpus('weights_generation/schemes_lsi_model_mm')  # Needed to build the matrix similarity index
spacy_nlp = spacy.load('en_core_web_sm')
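# Quick sanity check (illustrative only; the actual sizes depend on how the
# artifacts in weights_generation/ were trained):
# >>> len(dictionary), schemes_lsi_model.num_topics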
# For the mental-health classifier
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
with open('weights_generation/tvec', 'rb') as f:
    tvec_optimised = pickle.load(f)  # Fitted vectoriser for the mental-health classifier
with open('weights_generation/mentalhealth', 'rb') as f:
    mhmodel = pickle.load(f)  # Binary classifier estimating mental-health relevance
# Create the punctuation characters and the stopword set
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Keep "alone" and "themselves" as signal words; treat "client" as noise
to_delete = ["alone", "themselves"]
for elem in to_delete:
    stop_words.discard(elem)
stop_words.add("client")
def spacy_tokenizer(sentence):
    # Remove distracting single quotes
    sentence = re.sub(r'\'', '', sentence)
    # Collapse runs of spaces into a single space
    sentence = re.sub(r' +', ' ', sentence)
    # Remove unwanted lines starting with special characters
    sentence = re.sub(r'\n: \'\'.*', '', sentence)
    sentence = re.sub(r'\n!.*', '', sentence)
    sentence = re.sub(r'^:\'\'.*', '', sentence)
    # Replace newline characters with spaces
    sentence = re.sub(r'\n', ' ', sentence)
    # Remove punctuation
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Create spaCy token objects
    tokens = spacy_nlp(sentence)
    # Lowercase, strip, and lemmatise (spaCy 2.x returns "-PRON-" as the lemma for pronouns)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    # Drop stopwords, punctuation, and tokens shorter than 3 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    return tokens
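# Illustrative example (exact lemmas depend on the en_core_web_sm version);
# note that "alone" survives because it was removed from the stopword list,
# while "client" is dropped:
# >>> spacy_tokenizer("The client lives alone and needs counselling")
# ['live', 'alone', 'need', 'counselling']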
# Create the stemmer for the mental-health classifier
porter = PorterStemmer()
def stemSentence(sentence):
    token_words = word_tokenize(sentence)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
from gensim.similarities import MatrixSimilarity
schemes_index = MatrixSimilarity(schemes_lsi_corpus, num_features=schemes_lsi_corpus.num_terms)
# Search for schemes similar to a free-text query
from operator import itemgetter
counter = 0  # Number of queries served so far
def search_similar_schemes(search_term, x):
    """Return schemes ranked by LSI similarity to search_term, keeping only
    results with a relevance score above the cutoff x."""
    global counter
    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = schemes_tfidf_model[query_bow]
    query_lsi = schemes_lsi_model[query_tfidf]
    schemes_index.num_best = 50
    schemes_list = schemes_index[query_lsi]
    schemes_list.sort(key=itemgetter(1), reverse=True)
    schemes_names = []
    for j, scheme in enumerate(schemes_list):
        schemes_names.append(
            {
                'Relevance': round(scheme[1] * 100, 2),
                'Scheme': df_schemes['Scheme'][scheme[0]],
                'Description': df_schemes['Description'][scheme[0]],
                'Agency': df_schemes['Agency'][scheme[0]],
                'Image': df_schemes['Image'][scheme[0]],
                'Link': df_schemes['Link'][scheme[0]],
                'What it gives': df_schemes['What it gives'][scheme[0]],
                'Scheme Type': df_schemes['Scheme Type'][scheme[0]]
            }
        )
        if j == (schemes_index.num_best - 1):
            break
    output = pd.DataFrame(schemes_names, columns=['Relevance', 'Scheme', 'Description', 'Agency', 'Image', 'Link', 'What it gives', 'Scheme Type'])
    mhprob = mhmodel.predict_proba(tvec_optimised.transform([stemSentence(search_term)]).todense())[0][1]
    # Boost relevance by 5% for mental-health-related schemes when the query
    # itself is likely mental-health-related
    mh_keywords = ('mental health', 'counselling', 'emotional care', 'casework')
    output['Relevance'] = output.apply(
        lambda y: y['Relevance'] * 1.05
        if mhprob > 0.55 and (any(kw in y['Scheme Type'].lower() for kw in mh_keywords)
                              or any(kw in y['What it gives'].lower() for kw in mh_keywords))
        else y['Relevance'],
        axis=1)
    output = output.sort_values(by=['Relevance'], ascending=False)
    output = output[output['Relevance'] > x]  # Keep only schemes above the relevance cutoff
    jsonobject = output.to_json(orient="records")
    counter = counter + 1
    jsonobject = {
        "data": json.loads(jsonobject),
        "mh": mhprob
    }
    return jsonobject
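# Minimal usage sketch; the query string and the relevance cutoff of 20 are
# illustrative values, not part of the original module.
if __name__ == '__main__':
    results = search_similar_schemes("financial assistance for elderly caregiver", 20)
    print(json.dumps(results, indent=2))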