forked from iamlemec/fastpat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
firm_cluster.py
295 lines (242 loc) · 11.5 KB
/
firm_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# name matching using locally sensitive hashing (simhash)
# Simhash ported from Liang Sun (2013)
from itertools import chain, izip, imap, ifilter, repeat
from collections import defaultdict
import operator as op
import re
import sqlite3
from math import ceil
from scipy.stats import itemfreq
import numpy as np
from hashlib import md5
from distance.cdistance import levenshtein
import networkx as nx
#
# globals
#
db_fname = 'store/patents.db'
#
# weak name standardization
#
# regular expression substitutions
paren = r"'S|\(.*\)|\."
punct = r"[^\w\s]"
space = r"[ ]{2,}"
paren_re = re.compile(paren)
punct_re = re.compile(punct)
space_re = re.compile(space)
# standardize firm name
def name_standardize(name):
name_strip = name
name_strip = paren_re.sub(' ',name_strip)
name_strip = punct_re.sub(' ',name_strip)
name_strip = space_re.sub(' ',name_strip)
name_strip = name_strip.strip()
return name_strip
#
# locally sensitive hashing code
#
def hashfunc(x):
return int(md5(x).hexdigest(),16)
# k-shingles: pairs of adjacent k-length substrings (in order)
def shingle(s, k=2):
"""Generate k-length shingles of string s."""
k = min(len(s), k)
for i in range(len(s) - k + 1):
yield s[i:i+k]
class Simhash:
# dim is the simhash width, k is the tolerance
def __init__(self, dim=64, k=3, thresh=1):
self.dim = dim
self.k = k
self.thresh = thresh
self.unions = []
self.hashmaps = [defaultdict(list) for _ in range(k+1)] # defaultdict(list)
self.masks = [1 << i for i in range(dim)]
self.offsets = [self.dim // self.k * i for i in range(self.k)]
self.bin_masks = [(i == len(self.offsets) - 1 and 2 ** (self.dim - offset) - 1 or 2 ** (self.offsets[i+1]-offset) - 1) for (i,offset) in enumerate(self.offsets)]
# add item to the cluster
def add(self, item, weights=None, label=None):
# Ensure label for this item
if label is None:
label = item
# get simhash signature
simhash = self.sign(item,weights)
keyvec = self.get_keys(simhash)
# Unite labels with the same keys in the same band
matches = defaultdict(int)
for idx, key in enumerate(keyvec):
others = self.hashmaps[idx][key]
for l in others:
matches[l] += 1
others.append(label)
for out, val in matches.iteritems():
if val > self.thresh:
self.unions.append((label,out))
# compute actual simhash
def sign(self, features, weights):
if weights is None:
weights = [1.0]*len(features)
hashs = map(hashfunc,features)
v = [0.0]*self.dim
for (h,w) in izip(hashs,weights):
for i in xrange(self.dim):
v[i] += w if h & self.masks[i] else -w
ans = 0
for i in xrange(self.dim):
if v[i] >= 0:
ans |= self.masks[i]
return ans
# bin simhash into chunks
def get_keys(self, simhash):
return [simhash >> offset & mask for (offset,mask) in zip(self.offsets,self.bin_masks)]
#
# data processing routines
#
# white magic
def autodb(fname):
def wrap(f):
fvars = f.func_code.co_varnames
has_con = 'con' in fvars
has_cur = 'cur' in fvars
if has_con or has_cur:
def f1(*args,**kwargs):
con = sqlite3.connect(fname)
if has_cur:
cur = con.cursor()
if has_con:
ret = f(con=con,cur=cur,*args,**kwargs)
else:
ret = f(cur=cur,*args,**kwargs)
else:
ret = f(con=con,*args,**kwargs)
con.close()
return ret
return f1
else:
return f
return wrap
@autodb(db_fname)
def generate_names(con,cur):
# standardize compustat names
cur.execute('drop table if exists compustat_std')
cur.execute('create table compustat_std (gvkey int, year int, namestd text)')
ret = cur.execute('select gvkey,year,name from compustat')
cur.executemany('insert into compustat_std values (?,?,?)',map(lambda (gvkey,year,owner): (gvkey,year,name_standardize(owner)),ret))
# standardize patent names
cur.execute('drop table if exists patent_std')
cur.execute('create table patent_std (patnum int, namestd int)')
ret = cur.execute('select patnum,owner from patent_use')
cur.executemany('insert into patent_std values (?,?)',map(lambda (patnum,owner): (patnum,name_standardize(owner)),ret))
# standardize assignment names
cur.execute('drop table if exists assignment_std')
cur.execute('create table assignment_std (assignid int, assigneestd int, assignorstd)')
ret = cur.execute('select assignid,assignor,assignee from assignment_use')
cur.executemany('insert into assignment_std values (?,?,?)',map(lambda (assignid,assignor,assignee): (assignid,name_standardize(assignor),name_standardize(assignee)),ret))
# store unique names
cur.execute('drop table if exists owner')
cur.execute('create table owner (ownerid integer primary key asc, name text)')
cur.execute("""insert into owner(name) select namestd from compustat_std
union select namestd from patent_std
union select assigneestd from assignment_std
union select assignorstd from assignment_std""")
# map back into compustat
cur.execute('drop table if exists compustat_owner')
cur.execute('create table compustat_owner (gvkey int, year int, ownerid int)')
cur.execute('insert into compustat_owner select gvkey,year,ownerid from compustat_std join owner on compustat_std.namestd=owner.name')
# map back into patent
cur.execute('drop table if exists patent_owner')
cur.execute('create table patent_owner (patnum int, ownerid int)')
cur.execute('insert into patent_owner select patnum,ownerid from patent_std join owner on patent_std.namestd=owner.name')
# map back into assignments
cur.execute('drop table if exists assignment_owner')
cur.execute('create table assignment_owner (assignid int, assigneeid int, assignorid int)')
cur.execute("""insert into assignment_owner select assignid,assignee_owner.ownerid,assignor_owner.ownerid from assignment_std
join owner as assignee_owner on assignment_std.assigneestd=assignee_owner.name
join owner as assignor_owner on assignment_std.assignorstd=assignor_owner.name""")
con.commit()
# k = 8, thresh = 4 works well
@autodb(db_fname)
def owner_cluster(con,cur,nitem=None,reverse=True,nshingle=2,store=False,**kwargs):
c = Simhash(**kwargs)
cmd = 'select ownerid,name from owner'
if reverse:
cmd += ' order by rowid desc'
if nitem:
cmd += ' limit %i' % nitem
name_dict = {}
for (i,(ownerid,name)) in enumerate(cur.execute(cmd)):
words = name.split()
shings = list(shingle(name,nshingle))
features = shings + words
weights = list(np.linspace(1.0,0.0,len(shings))) + list(np.linspace(1.0,0.0,len(words)))
c.add(features,weights=weights,label=ownerid)
name_dict[ownerid] = name
if i%10000 == 0:
print i
ipairs = c.unions
npairs = map(lambda p: map(name_dict.get,p),ipairs)
print 'Found %i pairs' % len(ipairs)
if store:
cur.execute('drop table if exists pair')
cur.execute('create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)')
cur.executemany('insert into pair values (?,?,?,?)',imap(lambda ((o1,o2),(n1,n2)): (o1,o2,n1,n2),izip(ipairs,npairs)))
con.commit()
else:
return ipairs
# compute distances on owners in same cluster
@autodb(db_fname)
def find_components(con,cur,thresh=0.85,store=False):
cmd = 'select * from pair'
def dmetr(name1,name2):
maxlen = max(len(name1),len(name2))
ldist = levenshtein(name1,name2,max_dist=int(ceil(maxlen*(1.0-thresh))))
return 1.0 - float(ldist)/maxlen if ldist != -1 else 0.0
dists = map(lambda (o1,o2,n1,n2): (o1,o2,dmetr(n1,n2)),cur.execute(cmd))
close = map(op.itemgetter(0,1),filter(lambda (o1,o2,d): d > thresh,dists))
G = nx.Graph()
G.add_edges_from(close)
comps = sorted(nx.connected_components(G),key=len,reverse=True)
if store:
cur.execute('drop table if exists component')
cur.execute('create table component (compid int, ownerid int)')
cur.executemany('insert into component values (?,?)',chain(*[izip(repeat(cid),comp) for (cid,comp) in enumerate(comps)]))
con.commit()
else:
return comps
# must be less than 1000000 components
@autodb(db_fname)
def merge_components(con,cur):
# match owners to firms
cur.execute('drop table if exists owner_firm')
cur.execute('create table owner_firm (ownerid int, firm_num int)')
cur.execute('insert into owner_firm select ownerid,compid+1000000 from owner left join component using(ownerid)')
cur.execute('update owner_firm set firm_num=ownerid where firm_num is null')
cur.execute('drop table if exists compustat_merge')
cur.execute("""create table compustat_merge as select compustat.*,compustat_owner.ownerid,owner_firm.firm_num
from compustat left join compustat_owner using(gvkey,year)
left join owner_firm using(ownerid)""")
cur.execute('drop table if exists patent_merge')
cur.execute("""create table patent_merge as select patent_use.*,patent_owner.ownerid,owner_firm.firm_num
from patent_use left join patent_owner using(patnum)
left join owner_firm using(ownerid)""")
cur.execute('drop table if exists assignment_merge')
cur.execute("""create table assignment_merge as select assignment_use.*,assignment_owner.assigneeid,assignment_owner.assignorid,assignee_firm.firm_num as dest_fn,assignor_firm.firm_num as source_fn
from assignment_use left join assignment_owner on assignment_use.assignid=assignment_owner.assignid
left join owner_firm as assignee_firm on assignment_owner.assigneeid=assignee_firm.ownerid
left join owner_firm as assignor_firm on assignment_owner.assignorid=assignor_firm.ownerid""")
# aggregate to yearly
cur.execute('drop table if exists patent_basic')
cur.execute('create table patent_basic (patnum int primary key, firm_num int, fileyear int, grantyear int, classone int, classtwo int)')
cur.execute('insert into patent_basic select patnum,firm_num,strftime(\'%Y\',filedate),strftime(\'%Y\',grantdate),classone,classtwo from patent_merge')
cur.execute('drop table if exists assignment_info')
cur.execute('create table assignment_info (assignid int primary key, patnum int, source_fn int, dest_fn int, execyear int, recyear int, grantyear int, fileyear int, classone int, classtwo int)')
cur.execute('insert into assignment_info select assignid,patnum,source_fn,dest_fn,substr(execdate,1,4),substr(recdate,1,4),strftime(\'%Y\',grantdate),strftime(\'%Y\',filedate),classone,classtwo from assignment_merge')
cur.execute('drop table if exists assignment_bulk')
cur.execute('create table assignment_bulk (source_fn int, dest_fn int, execyear int, ntrans int)')
cur.execute('insert into assignment_bulk select source_fn,dest_fn,execyear,count(*) from assignment_info group by source_fn,dest_fn,execyear')
con.commit()
@autodb(db_fname)
def get_names(con,cur,olist=[]):
return cur.execute('select * from owner where ownerid in (%s)' % ','.join(map(str,olist))).fetchall()
# add in a few corp reductions, esp kabushiki kaisha