#from article_2_vector_word_count import *
#from collections import defaultdict
import lda
import sqlite3
import numpy as np
#from scipy.sparse import csr_matrix, save_npz
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
import time
import TXTnlp
from mpi4py import MPI
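# is_float is referenced by LemmaTokenizer below but is not defined in this file;
# a minimal assumed implementation is sketched here.
def is_float(s):
    """Return True if the token parses as a float, so numeric strings can be dropped."""
    try:
        float(s)
        return True
    except ValueError:
        return False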
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
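# For example, pos_tag may label "running" as 'VBG', which maps to wordnet.VERB,
# so WordNetLemmatizer().lemmatize('running', wordnet.VERB) yields 'run'.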
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        self.word_pos = pos_tag(word_tokenize(doc))
        return [self.wnl.lemmatize(w, get_wordnet_pos(p)) for w, p in self.word_pos
                if len(w) >= 3 and not w.isdigit() and not is_float(w)]
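# A minimal sketch of how LemmaTokenizer could plug into the imported
# CountVectorizer (not wired up in this script; the stop_words choice is an assumption):
#vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
#doc_term = vectorizer.fit_transform(art_list)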
# -----------------------------------
# Extracting features from database
# -----------------------------------
def article_extractor(sqlite_file, start_date, end_date):
    conn = sqlite3.connect(sqlite_file)
    c = conn.cursor()
    articles_2016 = c.execute("SELECT article FROM articles WHERE date BETWEEN ? AND ?", (start_date, end_date))
    articles_tuple = articles_2016.fetchall()
    conn.close()
    articles = [item[0] for item in articles_tuple]
    return articles
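# Example usage, pulling one day's articles from the SQLite file:
#articles = article_extractor(sqlite_file, '2016-01-01', '2016-01-02')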
directory='/Users/leihao/Downloads/'
sqlite_file=directory+'nasdaq.db'
# -----------------
# Get the MPI rank
# -----------------
comm = MPI.COMM_WORLD   # needed below for comm.allgather
#size = comm.Get_size()
#rank = comm.Get_rank()
#start_date, end_date = ('2016-01-01', '2016-01-02') if rank == 0 else ('2016-01-03', '2016-01-04')
#t0=time.time()
#articles=article_extractor(sqlite_file,start_date, end_date)
#t1=time.time()
#print("Extraction takes {0:.2f} seconds".format(t1-t0))
with sqlite3.connect(sqlite_file) as conn:
    c = conn.cursor()
    articles_2016 = c.execute("SELECT article, date FROM articles WHERE date BETWEEN '2016-01-01' AND '2016-01-02' ")
    articles_tuple = articles_2016.fetchall()
art_list = [item[0] for item in articles_tuple]
date_list = [item[1] for item in articles_tuple]
# Drop rows whose article text is NULL, keeping the article and date lists aligned
keep_ind = [i for i, a in enumerate(art_list) if a is not None]
art_list = [art_list[i] for i in keep_ind]
date_list = [date_list[i] for i in keep_ind]
txtlen_list=[len(TXTnlp.token(TXTnlp.TextBlob(item))) for item in art_list]
import pickle
with open(directory+'NumOfWordsPerArticle_all.pickle', 'wb+') as f:
pickle.dump(txtlen_list,f)
with open(directory+'DatesOfArticle_all.pickle', 'wb+') as f:
pickle.dump(date_list,f)
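# The pickled lists can be reloaded later for analysis, e.g.:
#with open(directory+'NumOfWordsPerArticle_all.pickle', 'rb') as f:
#    txtlen_list = pickle.load(f)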
# Summary statistics over articles that contain at least one token
nonzero_ele = np.array(txtlen_list)[np.nonzero(txtlen_list)]
print("mean tokens per article: {0:.2f} (std {1:.2f})".format(nonzero_ele.mean(), nonzero_ele.std()))
proc_cnt, proc_vocab = [], set()
wdnum, lemnum = 0, 0
for text in art_list:
    txt = TXTnlp.TextBlob(text)
    ### Tokens and POS tags
    txttok = TXTnlp.token(txt)
    wdnum += len(txttok)
    txtpos = TXTnlp.pos(txt)
    ### WordNet lemmatizer
    # Create namespace
    article = TXTnlp.types.SimpleNamespace()
    # Assign parts of namespace
    #article.blob = txt
    article.tok = txttok
    article.pos = txtpos
    # Lemmatize
    article = TXTnlp.lem(article)
    lemnum += len(article.lem)
    # Per-article lemma counts and vocabulary
    atcl_cnt, atcl_vocab = TXTnlp.cntr(article.lem)
    proc_cnt.append(atcl_cnt)
    proc_vocab.update(atcl_vocab)
# Gather every rank's vocabulary set and merge into one global vocabulary
vocab = set().union(*comm.allgather(proc_vocab))
# Convert the vocab set to a list so each word gets a stable integer index
vocab = list(vocab)
# Map each article's (word, count) pairs to (vocab index, count) pairs
# (vocab.index is O(|vocab|); a word->index dict would be faster for large vocabularies)
ind_cnt = []
for item in proc_cnt:
    atcl_ind_cnt = [(vocab.index(word), cnt) for word, cnt in item]
    ind_cnt.append(atcl_ind_cnt)
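# A hedged sketch of how ind_cnt could feed the imported lda package: densify the
# (index, count) pairs into a document-term matrix. n_topics and n_iter are assumptions.
#dtm = np.zeros((len(ind_cnt), len(vocab)), dtype=np.int64)
#for doc_i, pairs in enumerate(ind_cnt):
#    for word_i, cnt in pairs:
#        dtm[doc_i, word_i] = cnt
#model = lda.LDA(n_topics=20, n_iter=500)
#model.fit(dtm)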