forked from iamlemec/fastpat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
firm_cites.py
68 lines (55 loc) · 4.18 KB
/
firm_cites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# match citation data with aggregated firm data (to be run before patents_match_merge.py)
import sqlite3
import pandas as pd
# load in a lot of data
db_fname = 'store/patents.db'
con = sqlite3.connect(db_fname)
print 'Loading from database'
datf_cite = pd.read_sql('select citer,citee from citation',con)
datf_grant = pd.read_sql('select patnum,firm_num,fileyear from patent_basic',con)
# datf_trans = pd.read_sql('select assignid,patnum,source_fn,dest_fn,execyear from assignment_info',con)
print 'Matching with patents'
# match citations to firms with patnum
datf_cite.rename(columns={'citer':'citer_pnum','citee':'citee_pnum'},inplace=True)
datf_cite = datf_cite.merge(datf_grant,how='left',left_on='citer_pnum',right_on='patnum',suffixes=('','_citer'))
datf_cite = datf_cite.drop(['patnum'],axis=1).rename(columns={'firm_num':'citer_fnum','fileyear':'cite_year'})
datf_cite = datf_cite.merge(datf_grant[['patnum','firm_num']],how='left',left_on='citee_pnum',right_on='patnum',suffixes=('','_citee'))
datf_cite = datf_cite.drop(['patnum'],axis=1).rename(columns={'firm_num':'citee_fnum'})
datf_cite['self_cite'] = (datf_cite['citer_fnum'] == datf_cite['citee_fnum'])
print 'Aggregating together'
# patent level statistics
n_cited = datf_cite.groupby('citer_pnum').size()
n_citing = datf_cite.groupby('citee_pnum').size()
n_self_cited = datf_cite.groupby('citer_pnum')['self_cite'].sum()
datf_cite_stats = pd.concat([pd.Series(n_cited,name='n_cited'),pd.Series(n_citing,name='n_citing'),pd.Series(n_self_cited,name='n_self_cited')],axis=1).reset_index().rename(columns={'index':'patnum'})
# firm level statistics
datf_cite_year = datf_cite.groupby(['citer_fnum','citee_fnum','cite_year']).size().reset_index(name='ncites')
# first_cite_agg = pd.Series(datf_cite_year.groupby(['citer_fnum','citee_fnum'])['cite_year'].min(),name='first_cite')
# ncites_agg = pd.Series(datf_cite_year.groupby(['citer_fnum','citee_fnum'])['ncites'].sum(),name='ncites')
# datf_cite_agg = pd.concat([first_cite_agg,ncites_agg],axis=1).reset_index()
# determine whether acquiring firm cites selling firm
# datf_trans_firm = datf_trans.merge(datf_cite_agg[['citer_fnum','citee_fnum','first_cite','ncites']],how='left',left_on=['dest_fn','source_fn'],right_on=['citer_fnum','citee_fnum'])
# datf_trans_firm = datf_trans_firm.drop(['citer_fnum','citee_fnum'],axis=1).rename(columns={'first_cite':'dest_first_cite','ncites':'dest_ncites'})
# datf_trans_firm = datf_trans_firm.merge(datf_cite_agg[['citer_fnum','citee_fnum','first_cite','ncites']],how='left',left_on=['source_fn','dest_fn'],right_on=['citer_fnum','citee_fnum'])
# datf_trans_firm = datf_trans_firm.drop(['citer_fnum','citee_fnum'],axis=1).rename(columns={'first_cite':'source_first_cite','ncites':'source_ncites'})
# datf_trans_firm = datf_trans_firm.fillna({'dest_ncites':0,'source_ncites':0})
# determine whether patent was cited by acquiring firm
# datf_trans_pat = datf_trans.merge(datf_cite[['citer_fnum','citer_pnum','citee_pnum','cite_year']],how='left',left_on=['dest_fn','patnum'],right_on=['citer_fnum','citee_pnum'])
# datf_trans_pat = datf_trans_pat.drop(['citer_fnum','citee_pnum'],axis=1)
# ncites_before = pd.Series(datf_trans_pat[datf_trans_pat['cite_year']<=datf_trans_pat['execyear']].groupby('assignid').size(),name='ncites_before')
# ncites_after = pd.Series(datf_trans_pat[datf_trans_pat['cite_year']>datf_trans_pat['execyear']].groupby('assignid').size(),name='ncites_after')
# datf_trans_pat = datf_trans_pat.drop(['citer_pnum','cite_year'],axis=1).drop_duplicates()
# datf_trans_pat = datf_trans_pat.join(ncites_before,on='assignid')
# datf_trans_pat = datf_trans_pat.join(ncites_after,on='assignid')
# datf_trans_pat = datf_trans_pat.fillna({'ncites_before':0,'ncites_after':0})
print 'Writing to database'
# save frames back to sql - need to to this one at a time for memory reasons
# datf_cite.to_sql('firm_cite',con,if_exists='replace')
# datf_cite_stats.to_sql('cite_stats',con,if_exists='replace')
# datf_cite_year.to_sql('firm_cite_year',con,if_exists='replace')
# datf_cite_agg.to_sql('firm_cite_agg',con,if_exists='replace')
# datf_trans_firm.to_sql('trans_cite_firm',con,if_exists='replace')
# datf_trans_pat.to_sql('trans_cite_pat',con,if_exists='replace')
# close out
con.commit()
con.close()