-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathdata_prepare.py
122 lines (106 loc) · 4.47 KB
/
data_prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Name: data_prepare.py
# Author: Yandi LI
# Created: 2014/5/2
# Objective: Create the dataset for FM-BPR() model, save in pickle format
# Use: python *.py [options] <outfile>
# Require : table:context_cube, table:checkin_venue
#-----------------------------------------------------------------------------
import psycopg2, sys
from numpy import array
from scipy.sparse import csr_matrix
from numpy import unique
from optparse import OptionParser
def query_table(con, query):
"""
send a query to the database, return a list of incidences
"""
cur = con.cursor()
cur.execute(query)
result = cur.fetchall()
con.commit()
return result
def create_data(options, context_features = ['hour','conds','dow']):
try:
con = psycopg2.connect(database=options.database,
host=options.host,
user=options.user,
password=options.password)
query = 'SELECT (' +\
'row_number() OVER ())-1 AS cid,' +\
', '.join(context_features) +\
' FROM '+ options.tb_name_c +\
' GROUP BY '+ ', '.join(context_features)+\
' ORDER BY cid'
A = query_table(con, query) # cid-feature_vector matrix
query = 'WITH cid_attr as (\
SELECT (row_number() OVER ())-1 AS cid,'+\
', '.join(context_features) +\
' FROM '+ options.tb_name_c +\
' GROUP BY ' + ', '.join(context_features) +\
')' +\
"""SELECT cid, vid, COUNT(tweetid)::int AS count
FROM """ + options.tb_name_c + """ a
NATURAL JOIN cid_attr c
GROUP BY cid, vid
ORDER BY cid"""
Y = query_table(con,query) # cid-vid-count matrix
query = 'SELECT DISTINCT vid, tid'+\
' FROM ' + options.tb_name_v +\
' ORDER BY vid'
T = query_table(con, query) # vid-tid matrix
except psycopg2.DatabaseError, e:
print 'Error %s' % e
sys.exit(1)
else:
print "Loading database: Success!"
finally:
if con:
con.close()
Y = array(Y)
cid_attributes = array(A)[:,1:] # omit index column
vid_attributes = array(T,dtype='int')[:,1:] # omit index column
data = {'triple':csr_matrix((Y[:,2],(Y[:,0],Y[:,1]))),\
'c_attr':cid_attributes,\
'v_attr':vid_attributes}
return data
if __name__ == '__main__':
import pickle
###############################
## Argument Parsing
###############################
optparser = OptionParser(usage="""%prog [options] <output_pickle_file>
Query context_cube table to build design matrices in pickle format
""")
defaults = {'host':'localhost',\
'user':'postgres',\
'password':'',
'tb_name_c':'context_cube',
'tb_name_v':'checkin_venue',
'features':'hour conds dow'}
optparser.set_defaults(**defaults)
optparser.add_option('-d','--database',dest='database',
type='string', help='Database name.')
optparser.add_option('-H','--host',dest='host',
type='string', help='Host address. Default is %s.' % repr(defaults['host']))
optparser.add_option('-u','--user',dest='user',
type='string', help='User name. Default is %s.'% repr(defaults['user']))
optparser.add_option('-p','--password',dest='password',
type='string', help='Password. Default is empty')
optparser.add_option('-t','--table',dest='tb_name_c',
type='string', help='Table name of the context data cube. Default is %s.'%repr(defaults['tb_name_c']))
optparser.add_option('-v','--vtable',dest='tb_name_v',
type='string', help='Table name of the venue checkin table. Default is %s.'%repr(defaults['tb_name_v']))
optparser.add_option('-f','--features',dest='features',
type='string', help='Select columns to include as context features. Default is %s.'%repr(defaults['features']))
options, (outfile,) = optparser.parse_args()
if not options.database:
optparser.error('Database name not given.')
###############################
## Output to file
###############################
context_features = options.features.split() # select columns to include as context features
data = create_data(options, context_features)
pickle.dump(data, open(outfile,'w'))
print 'Well Done.'