-
Notifications
You must be signed in to change notification settings - Fork 13
/
feature_stack.py
45 lines (38 loc) · 2.36 KB
/
feature_stack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from scipy.sparse import hstack
from qc.dataprep.text_features import text_ft_arr
def get_ft_obj(data_type: str, rp: str, ml_algo: str, cat_type: str, data: list = None):
    """
    Build the combined feature matrix by fetching each vectorized text feature
    and stacking the pieces horizontally (column-wise).

    :argument:
        :param data_type: String, either `training` or `test`
                          (`api` when `data` is supplied by the caller).
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: Machine learning algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class: `coarse` or any of the 6 main classes
                         (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`).
        :param data: List of Docs that are the output of the NLP process;
                     provide this argument when data_type='api'.
    :return:
        x_all_ft: The result of `scipy.sparse.hstack` over the selected feature blocks,
                  to be fed to a Machine Learning algorithm.
                  Only this single object is returned (no status flag).
    """
    # Feature selection rationale:
    #   * Part of speech ("pos") duplicates the information in "tag", is therefore
    #     not needed, and reduces accuracy. See https://spacy.io/api/token
    #   * The raw word list ("word") is not a good feature; the lemma (root form)
    #     of each word is more useful.
    #   * The is-alphabetic flag ("alpha") reduces accuracy slightly, so it is
    #     not used for now.
    #
    # ---------------------------------- Experimental: tune the feature stack here ---------------------------------
    # `fetch_order` is the order in which the vectorizers are invoked; it is kept
    # separate from `stack_order` so that both sequences can be tuned independently.
    # To experiment with "pos", "word" or "alpha", add the name to both tuples.
    fetch_order = ("lemma", "tag", "dep", "shape", "stop", "ner")

    # Left-to-right column layout of the stacked result.
    # NOTE(review): presumably this layout has to match the one used when the model
    # was trained - confirm before reordering.
    stack_order = ("tag", "dep", "ner", "stop", "lemma", "shape")

    ft_blocks = {}
    for ft_name in fetch_order:
        ft_result = text_ft_arr(data_type, rp, ft_name, ml_algo, cat_type, data)
        # Only element 1 of the result is used here; presumably element 0 is a
        # success flag - verify against `text_ft_arr`.
        ft_blocks[ft_name] = ft_result[1]

    x_all_ft = hstack([ft_blocks[ft_name] for ft_name in stack_order])
    # ---------------------------------------------- Feature stack ends --------------------------------------------
    return x_all_ft