features.py
import os
import csv
import time
import pandas as pd
import re
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
'''
### Fields in instances.jsonl:
{
    "id": "<instance id>",
    "postTimestamp": "<weekday> <month> <day> <hour>:<minute>:<second> <time_offset> <year>",
    "postText": ["<text of the post with links removed>"],
    "postMedia": ["<path to a file in the media archive>"],
    "targetTitle": "<title of target article>",
    "targetDescription": "<description tag of target article>",
    "targetKeywords": "<keywords tag of target article>",
    "targetParagraphs": ["<text of the ith paragraph in the target article>"],
    "targetCaptions": ["<caption of the ith image in the target article>"]
}

### Fields in truth.jsonl:
{
    "id": "<instance id>",
    "truthJudgments": [<number in [0,1]>],
    "truthMean": <number in [0,1]>,
    "truthMedian": <number in [0,1]>,
    "truthMode": <number in [0,1]>,
    "truthClass": "clickbait | no-clickbait"
}
'''
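
# Illustrative records matching the schema above (hypothetical values, not taken
# from the corpus), each written as one line of the corresponding .jsonl file:
#
#   instances.jsonl:
#     {"id": "42", "postTimestamp": "Mon Jan 01 12:00:00 +0000 2018",
#      "postText": ["You will never guess what happened next"], "postMedia": [],
#      "targetTitle": "An example article title",
#      "targetDescription": "An example description",
#      "targetKeywords": "example, keywords",
#      "targetParagraphs": ["First paragraph.", "Second paragraph."],
#      "targetCaptions": ["An example caption"]}
#
#   truth.jsonl:
#     {"id": "42", "truthJudgments": [1.0, 0.6, 0.6, 1.0, 0.6],
#      "truthMean": 0.76, "truthMedian": 0.6, "truthMode": 0.6,
#      "truthClass": "clickbait"}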
INSTANCES = 'instances.jsonl'
TRUTH = 'truth.jsonl'
FEATURES = 'features.csv'
analyser = SentimentIntensityAnalyzer()
def sentiment_scores(sentence):
    # VADER returns a dict with 'neg', 'neu', 'pos' and 'compound' scores;
    # collapse it to a binary flag: 1 if the text leans positive, else 0.
    score = analyser.polarity_scores(sentence)
    return 1 if score['pos'] > score['neg'] else 0
def tokenize_words(text):
    return nltk.word_tokenize(text)

def find_pos_tags(tokens):
    # Returns a list of (token, Penn Treebank tag) tuples.
    return nltk.pos_tag(tokens)

# Counters for individual Penn Treebank tags in a list of (token, tag) tuples.
def find_nnp(tups):
    # NNP: proper noun, singular
    return sum(1 for tup in tups if tup[1] == 'NNP')

def find_nn(tups):
    # NN: common noun, singular
    return sum(1 for tup in tups if tup[1] == 'NN')

def find_dt(tups):
    # DT: determiner
    return sum(1 for tup in tups if tup[1] == 'DT')

def find_rb(tups):
    # RB: adverb
    return sum(1 for tup in tups if tup[1] == 'RB')

def find_prp(tups):
    # PRP: personal pronoun
    return sum(1 for tup in tups if tup[1] == 'PRP')

def find_vbd(tups):
    # VBD: verb, past tense
    return sum(1 for tup in tups if tup[1] == 'VBD')

def find_vbp(tups):
    # VBP: verb, non-3rd-person singular present
    return sum(1 for tup in tups if tup[1] == 'VBP')

def good_words(tups):
    # Keep only the tokens tagged as determiner, adverb or personal pronoun.
    return [tup[0] for tup in tups if tup[1] in ['DT', 'RB', 'PRP']]

def char_length(line):
    return len(line)

def word_length(line):
    return len(line.split())

def distance(a, b):
    # Absolute difference; works elementwise on pandas Series as well as scalars.
    return abs(a - b)
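
# A minimal usage sketch (not executed by the pipeline; the headline is made up)
# showing how the helpers above compose on a single string:
#
#   tags = find_pos_tags(tokenize_words("you will never guess this one trick".lower()))
#   find_prp(tags)    # personal pronouns such as "you"
#   find_rb(tags)     # adverbs such as "never"
#   sentiment_scores("you will never guess this one trick")  # 1 if VADER leans positive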
def extractFeatures(inDir):
    print("Starting feature generation")
    start_time = time.time()

    # Tokenizer and POS-tagger models required by nltk.word_tokenize / nltk.pos_tag.
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    data = pd.read_json(os.path.join(inDir, INSTANCES), dtype={'id': str}, lines=True)
    # postText is stored as a single-element list; unwrap it to a plain string.
    data['postText'] = data['postText'].apply(lambda x: x[0])

    # POS-tag the lower-cased post text, target description and target title.
    data['processed_postText'] = data.postText.apply(lambda x: find_pos_tags(tokenize_words(x.lower())))
    data['processed_targetDescription'] = data.targetDescription.apply(lambda x: find_pos_tags(tokenize_words(x.lower())))
    data['processed_targetTitle'] = data.targetTitle.apply(lambda x: find_pos_tags(tokenize_words(x.lower())))

    features = pd.DataFrame()
    features['id'] = data['id']

    # If the ground truth is available (training data), attach the class label.
    if os.path.isfile(os.path.join(inDir, TRUTH)):
        print("Truth file found")
        truth_data = pd.read_json(os.path.join(inDir, TRUTH), dtype={'id': str}, lines=True)
        features = features.merge(truth_data[['id', 'truthClass']], on='id')

    fields = ['postText', 'targetTitle', 'targetDescription', 'targetKeywords']
    fields_2 = ['processed_postText', 'processed_targetDescription', 'processed_targetTitle']

    # Surface features: lengths, punctuation counts, hashtags, and whether the text starts with a digit.
    for f in fields:
        features['chars_' + f] = data[f].apply(char_length)
        features['wrds_' + f] = data[f].apply(word_length)
        features['#?_' + f] = data[f].apply(lambda x: x.count('?'))
        features['#!_' + f] = data[f].apply(lambda x: x.count('!'))
        features['##_' + f] = data[f].apply(lambda x: len(re.findall(r"(?<!#)#(?![#\s])", x)))
        features[r'#^\d_' + f] = data[f].apply(lambda x: len(re.findall(r"^\d.*", x)))

    # Binary sentiment flag for post text, target title and target description.
    for y in fields[0:3]:
        features['sentiment_' + y] = data[y].apply(sentiment_scores)

    # Part-of-speech tag counts on the POS-tagged columns.
    for x in fields_2:
        features['count_nn' + x] = data[x].apply(find_nn)
        features['count_nnp' + x] = data[x].apply(find_nnp)
        features['count_dt' + x] = data[x].apply(find_dt)
        features['count_prp' + x] = data[x].apply(find_prp)
        features['count_rb' + x] = data[x].apply(find_rb)
        features['count_vbd' + x] = data[x].apply(find_vbd)
        features['count_vbp' + x] = data[x].apply(find_vbp)

    # Pairwise absolute differences between the length features of each field pair.
    for i in range(len(fields)):
        for j in range(i + 1, len(fields)):
            features['d_chars_' + fields[i] + '-' + fields[j]] = distance(features['chars_' + fields[i]], features['chars_' + fields[j]])
            features['d_wrds_' + fields[i] + '-' + fields[j]] = distance(features['wrds_' + fields[i]], features['wrds_' + fields[j]])

    features.to_csv(FEATURES, index=False)
    print("Feature generation took {:.2f} seconds".format(time.time() - start_time))
if __name__ == "__main__":
    extractFeatures('data-medium')
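
# A minimal follow-up sketch (an assumption, not part of this script) for loading the
# generated features.csv downstream; 'truthClass' is only present when truth.jsonl
# was found next to instances.jsonl:
#
#   feats = pd.read_csv(FEATURES, dtype={'id': str})
#   X = feats.drop(columns=[c for c in ('id', 'truthClass') if c in feats.columns])
#   y = feats['truthClass'] if 'truthClass' in feats.columns else None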