-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_keyword.py
48 lines (39 loc) · 1.47 KB
/
news_keyword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import csv
import collections
import jieba
import pandas as pd
def unique_words(text, tokenizer):
    """Return the distinct tokens of *text* in first-occurrence order.

    Args:
        text: The article body. Anything that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell) is treated
            as having no words instead of being handed to the tokenizer.
        tokenizer: Callable that takes a string and returns an iterable of
            tokens (the script passes ``jieba.cut``).

    Returns:
        A list with each token appearing once.
    """
    if not isinstance(text, str):
        return []
    # dict.fromkeys de-duplicates while keeping the first-seen order.
    return list(dict.fromkeys(tokenizer(text)))


def count_document_frequencies(names, contents, tokenizer):
    """Count, per word, how many articles of each class contain it.

    A row whose ``name`` value is exactly the string ``'[]'`` is counted in
    the "non" class; every other row is counted in the "money" class.
    Each word is counted at most once per article.

    Args:
        names: Iterable of the ``name`` column values.
        contents: Iterable of the ``content`` column values, parallel to
            *names*.
        tokenizer: Callable used to split a content string into tokens.

    Returns:
        A tuple ``(cnt_keyword, cnt_non_keyword, moneynews_num, nonnews_num)``
        holding the two ``collections.Counter`` objects and the number of
        rows that fell into each class.
    """
    cnt_keyword = collections.Counter()
    cnt_non_keyword = collections.Counter()
    moneynews_num = 0
    nonnews_num = 0
    for name, content in zip(names, contents):
        words = unique_words(content, tokenizer)
        if name != '[]':
            cnt_keyword.update(words)
            moneynews_num += 1
        else:
            cnt_non_keyword.update(words)
            nonnews_num += 1
    return cnt_keyword, cnt_non_keyword, moneynews_num, nonnews_num


def build_keyword_table(cnt_keyword, cnt_non_keyword,
                        prob_threshold=0.4, freq_threshold=5):
    """Build the filtered keyword table from the two per-class counters.

    Args:
        cnt_keyword: Mapping word -> number of "money" articles containing it.
        cnt_non_keyword: Mapping word -> number of "non" articles containing it.
        prob_threshold: Rows whose ``cond_prob`` is below this are dropped.
        freq_threshold: Rows whose combined article count is below this are
            dropped.

    Returns:
        A DataFrame indexed by word with columns ``money_news``,
        ``none_news`` and ``cond_prob`` (``money_news`` divided by the sum of
        both counts), sorted by ``cond_prob`` in descending order.
    """
    keyword = pd.DataFrame({'money_news': cnt_keyword,
                            'none_news': cnt_non_keyword})
    # A word seen in only one class is missing from the other counter.
    keyword = keyword.fillna(0)
    keyword['cond_prob'] = (
        keyword['money_news'] / (keyword['money_news'] + keyword['none_news'])
    )
    keyword = keyword.sort_values(by='cond_prob', ascending=False)

    # Computed after sorting so both masks share the frame's index order.
    total = keyword['money_news'] + keyword['none_news']
    dropped = (keyword['cond_prob'] < prob_threshold) | (total < freq_threshold)
    return keyword.loc[~dropped]


def main():
    """Read ESUN_news.csv, print the class sizes and write keyword_filted.csv."""
    data = pd.read_csv('ESUN_news.csv')
    news_num = len(data.index)

    cnt_keyword, cnt_non_keyword, moneynews_num, nonnews_num = (
        count_document_frequencies(data['name'], data['content'], jieba.cut)
    )

    print('news_num:', news_num)
    print('moneynews_num:', moneynews_num)
    print('nonnews_num:', nonnews_num)

    keyword = build_keyword_table(cnt_keyword, cnt_non_keyword,
                                  prob_threshold=0.4, freq_threshold=5)
    keyword.to_csv('keyword_filted.csv')


if __name__ == '__main__':
    main()