This repository has been archived by the owner on Aug 2, 2024. It is now read-only.
forked from natgaertner/candidate_classifier
get_fb_links.py
549 lines (467 loc) · 18.2 KB
import urlparse
import re
import csv
import json
import time
import random
import sys
import chardet
import conversions
import requests
from state_map import state_map
from multiprocessing import Pool, Lock
def getlinks(candidate, webpage, state, district_type, district_name):
"""
Gets all the Facebook links for a candidate via the Google Custom Search API
"""
# ### Cleanup input variables
# District
district_type = district_type.replace('_', ' ').strip()
district_type = '+'.join(district_type.split(' '))
district_name = '+'.join(district_name.strip().split(' '))
# State
state = state_map[state.strip()]
state = '+'.join(state.split(' '))
# Candidate name
candidate, last, first = conversions.clean_name(candidate)
candidate = '+'.join(candidate.split(' '))
#print 'CANDIDATE: {}'.format(candidate)
# Setup search urls
search_urls = []
extra_children_searches = []
precise_searches = []
# Common values
url = "https://www.googleapis.com/customsearch/v1"
cx = "011743744063680272768:cp4-iesopjm"
key = "AIzaSyCdHlGJuMzGBH9hNsEMObffDIkzJ44EQhA"
search_urls.append(
u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}'.format(
url=url, cx=cx, key=key, name=candidate, state=state
)
)
# Just searches for general about pages
extra_children_searches.append(
u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+info'.format(
url=url, cx=cx, key=key, name=candidate, state=state
)
)
# sk=info specifies Facebook's about page
extra_children_searches.append(
u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+sk=info'.format(
url=url, cx=cx, key=key, name=candidate, state=state
)
)
precise_searches.append(
u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+campaign'.format(
url=url, cx=cx, key=key, name=candidate, state=state
)
)
precise_searches.append(
u'{url}?cx={cx}&key={key}&hl=en&q={name}+{state}+elect'.format(
url=url, cx=cx, key=key, name=candidate, state=state
)
)
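# For illustration (hypothetical candidate 'Jane+Doe' in 'Pennsylvania'),
# the first entry in search_urls ends up looking roughly like
#   https://www.googleapis.com/customsearch/v1?cx=<cx>&key=<key>&hl=en&q=Jane+Doe+Pennsylvania
# with '+info' / '+sk=info' variants in extra_children_searches and
# '+campaign' / '+elect' variants in precise_searches.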
# Clean up the encoding of the URLs
search_urls = [
s.encode(
chardet.detect(s.encode('utf-8'))['encoding']
) for s in search_urls
]
extra_children_searches = [
s.encode(
chardet.detect(s.encode('utf-8'))['encoding']
) for s in extra_children_searches
]
#print 'SEARCH_URLS: {}'.format(search_urls)
precise_searches = [
s.encode(
chardet.detect(s.encode('utf-8'))['encoding']
) for s in precise_searches
]
# 'www.gernensamples.com' is presumably a dummy website used for testing,
# so skip redirect resolution for it.
# get_redirect simply follows redirects to the final page that returns a 200
old_webpage = webpage
if webpage != 'www.gernensamples.com':
webpage = conversions.get_redirect(webpage)
#print 'WBBPAGES: {}'.format(webpage)
has_webpage = True
# raise Exception # why do we need this exception??
# print 'ok?'
# Clean up the web page URLs by removing the protocol, 'www.' subdomain, and trailing '/'
if has_webpage:
#print has_webpage
webpage_stripped = re.match(
r'(?:https?://)?(?:www\.)?(?P<content>.+)', webpage
).groupdict()['content'].rstrip('/')
old_webpage_stripped = re.match(
r'(?:https?://)?(?:www\.)?(?P<content>.+)', old_webpage
).groupdict()['content'].rstrip('/')
# TODO strip queries
webpage_no_queries = urlparse.urlparse(webpage)
webpage_no_queries = re.match(
r'(?:www\.)?(?P<content>.+)',
webpage_no_queries.netloc + webpage_no_queries.path
).groupdict()['content'].rstrip('/')
old_webpage_no_queries = urlparse.urlparse(old_webpage)
#print 'NO:{}'.format(old_webpage_no_queries)
if old_webpage_no_queries is not None:
old_webpage_no_queries = re.match(
r'(?:www\.)?(?P<content>.+)',
old_webpage_no_queries.netloc + old_webpage_no_queries.path
).groupdict()['content'].rstrip('/')
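# Illustration (hypothetical URL): for webpage
# 'https://www.janedoeforpa.com/home/?ref=fb',
#   webpage_stripped    -> 'janedoeforpa.com/home/?ref=fb'
#   webpage_no_queries  -> 'janedoeforpa.com/home'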
# The stripped webpage strings are meant to be matched literally, so
# escape any regex metacharacters they may contain
patt = re.compile(
r'^https?://(?:www\.)?{webpage}/?$'.format(
webpage=re.escape(webpage_stripped.lower())
)
)
old_patt = re.compile(
r'^https?://(?:www\.)?{webpage}/?$'.format(
webpage=re.escape(old_webpage_stripped.lower())
)
)
child_patt = re.compile(
r'^https?://(?:www\.)?{webpage}.+'.format(
webpage=re.escape(webpage_no_queries.lower())
)
)
old_child_patt = re.compile(
r'^https?://(?:www\.)?{webpage}.+'.format(
webpage=re.escape(old_webpage_no_queries.lower())
)
)
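# Rough illustration: with a bare domain 'janedoeforpa.com' as the stripped
# webpage, patt matches 'https://www.janedoeforpa.com/' exactly, while
# child_patt also matches deeper pages such as
# 'https://janedoeforpa.com/about'.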
print 'starting'
n = 4
while True:
results = map(lambda x: json.loads(requests.get(x).text), search_urls)
#for r in results:
# print 'error' in r
if any(map(
lambda r: ('error' in r and (
r['error']['code'] == 403 or r['error']['code'] == 503)
), results)):
print 'sleeping'
time.sleep(n + random.randint(1, 1000)/1000.)
n = n*2
elif any(map(lambda r: 'error' in r, results)):
raise Exception(', '.join(
map(
lambda r: r['error']['message'],
filter(lambda r: 'error' in r, results)
)
))
else:
break
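# The same fetch-with-backoff pattern repeats below for the extra children
# and precise searches: on a 403/503 quota error the wait roughly doubles
# each attempt (4s, 8s, 16s, ... plus up to a second of jitter).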
n = 4
while True:
child_results = map(
lambda x: json.loads(requests.get(x).text),
extra_children_searches
)
if any(map(
lambda r: 'error' in r and (
r['error']['code'] == 403 or r['error']['code'] == 503
), child_results)):
print 'sleeping'
time.sleep(n + random.randint(1, 1000) / 1000.)
n = n * 2
elif any(map(
lambda r: 'error' in r, child_results
)):
raise Exception(', '.join(
map(
lambda r: r['error']['message'],
filter(lambda r: 'error' in r, child_results)
)
))
else:
break
n = 4
while True:
precise_results = map(
lambda x: json.loads(requests.get(x).text), precise_searches
)
if any(map(
lambda r: 'error' in r and (
r['error']['code'] == 403 or r['error']['code'] == 503
), precise_results)):
print 'sleeping'
time.sleep(n + random.randint(1, 1000) / 1000.)
n = n * 2
elif any(map(lambda r: 'error' in r, precise_results)):
raise Exception(', '.join(
map(
lambda r: r['error']['message'],
filter(lambda r: 'error' in r, precise_results)
)
))
else:
break
if type(results) != list:
results = [results]
# Ensure every result dict has an 'items' key (empty list when the search returned no items)
real_results = [
(r if 'items' in r else {'items': []}) for r in results
]
results = real_results
# print 'RESULTS:{}'.format(results)
# Get the result URLs and extract searchable text from each item's pagemap
search_links = [[i['link'].lower() for i in r['items']] for r in results]
search_text = [
[u'{title} {link} {pagemap} {snippet}'.format(
**convert_pagemap_dict(i)
).lower().encode('utf-8') for i in r['items']] for r in results
]
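# search_links / search_text are parallel lists of lists: one inner list
# per search URL, one entry per result item.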
# first loop may be unnecessary
for ri in range(len(search_links)):  # for each result object
for si in range(len(search_links[ri])):  # for each link in that result
# For each "precise result" (name+state+'elect'),
# see if the link is equivalent
# or a sub page of the main results (name+state)
for r in precise_results:
if 'items' in r:
for i in r['items']:
if conversions.child_or_equal_page(
search_links[ri][si], i['link'].lower(), True
):
search_text[ri][si] += ' bipspecialappearsinprecise' # noqa
# Get the child-result URLs and extract searchable text from each item's pagemap
child_links = [
i['link'].lower() for r in child_results if 'items' in r
for i in r['items']
]
child_text = [
u'{title} {link} {pagemap} {snippet}'.format(
**convert_pagemap_dict(i)
).lower().encode('utf-8') for r in child_results if 'items' in r
for i in r['items']
]
# Classify each search link based on its relationship
# to the provided web page: either PARENT, CHILD, TRUE (identity),
# or FALSE (no match)
search_class = [
map(lambda s: conversions.page_relation(
s, True, webpage, old_webpage
), sl) for sl in search_links
]
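# Illustration: with webpage 'janedoeforpa.com', page_relation would
# presumably label 'http://www.janedoeforpa.com/' as 'True',
# 'https://janedoeforpa.com/about' as 'Child', and an unrelated domain as
# 'False' (see the `classes` constant defined below).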
# TODO Clean up ssv code
# Seems to match each search link result against the webpage domain
ssv = [
any(map(patt.match, sl)) or any(map(old_patt.match, sl))
for sl in search_links
]
non_websites = [
[i['link'] for i in r['items'] if webpage not in i['link']]
for r in results
]
cs, ct, cc = zip(
*[combine_children(
search_links[i], search_text[i], search_class[i],
child_links, child_text
) for i in range(len(search_links))]
)
print 'got there', len(results[0]['items'])
return (non_websites, ssv, webpage_stripped, search_links, search_text,
[r['items'] for r in results], search_class, cs, ct, cc,
child_links, child_text)
# CONSTANTS
classes = ('ParentCombined', 'TrueCombined', 'ChildCombined', 'FalseCombined',
'Parent', 'True', 'Child', 'False')
class_ranks = dict((classes[i], i) for i in range(len(classes)))
youtube_accept = ['title', 'description']
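# For reference, class_ranks maps each label in `classes` to its index
# (class_ranks['ParentCombined'] == 0, ..., class_ranks['False'] == 7);
# combine_children below prefers the lowest rank when picking a group's
# representative class.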
# HELPER METHODS
def class_order(cls1, cls2):
"""
Returns the rank distance from cls1 to cls2 in the `classes` ordering
(positive when cls1 outranks cls2)
"""
return class_ranks[cls2]-class_ranks[cls1]
def combine_children(websites, texts, classes, child_links, child_text):
"""
Combines related sites into groups under their main parent site and
returns a tuple of three parallel lists: the group parent sites,
the combined text for each group, and the group classes.
"""
combined_sites = {'websites': [], 'texts': [], 'classes': []}
# root_sites = []
# temp_root_sites = {}
# Create tuples of each search link with its class
# (i.e. parent, child, identical, no-match)
sites_classes = zip(websites, classes)
# Loop over each site and combine. Iterate over a copy, because group
# members are popped from sites_classes inside the loop body.
for site, cls in sites_classes[:]:
    # Skip sites that were already folded into an earlier group
    if site not in websites:
        continue
# This creates a group object that contains
# a list of tuples for every search result
# which is a child, parent, or equal of another search result
# for this candidate
group = filter(
lambda s: class_order(
conversions.page_relation(s[0], True, site), 'False'
) > 0,
zip(websites, classes, texts)
)
# Pick the tuple with the lowest class rank (the closest relationship,
# typically the parent page); it becomes the group's representative
try:
min_tuple = min(group, key=lambda g: class_ranks[g[1]])
except:
import pdb
pdb.set_trace()
# Setup the '{class}combined' group
group_class = min_tuple[1]+'Combined'
group_site = min_tuple[0]
group_children_text = []
# Create a tuple of child pages with their text
child_site_texts = zip(child_links, child_text)
# Loop over each child site and determine if it is
# a child of the group_site website. If so, append its info
# to the group_children_text list and remove it from the list
# of child sites
# Iterate over a copy, since matched children are popped below
for child, ctext in child_site_texts[:]:
if conversions.child_or_equal_page(group_site, child, True):
group_children_text.append(ctext)
child_idx = child_links.index(child)
child_links.pop(child_idx)
child_text.pop(child_idx)
child_site_texts.pop(child_idx)
# Add this site to the combined_sites dict
combined_sites['websites'].append(group_site)
combined_sites['classes'].append(group_class)
group_text = ' '.join(group_children_text)
# Remove each site in this group from the various lists
for g in group:
site_idx = websites.index(g[0])
websites.pop(site_idx)
classes.pop(site_idx)
sites_classes.pop(site_idx)
texts.pop(site_idx)
# Append the group members' text and add it to the combined_sites dict
group_text += ' ' + ' '.join(map(lambda g: g[2], group))
combined_sites['texts'].append(group_text)
return (combined_sites['websites'], combined_sites['texts'],
combined_sites['classes'])
def extract_pagemap_text(pagemap, text='', youtube=False):
"""
Extracts usable text for searching from
Google's Pagemap object
"""
if type(pagemap) == list:
for item in pagemap:
text = extract_pagemap_text(item, text, youtube)
return text
elif type(pagemap) == dict:
for k, v in pagemap.iteritems():
if youtube and k not in youtube_accept:
continue
if k == 'videoobject':
text = extract_pagemap_text(v, text, True)
elif k == 'metatags' and 'videoobject' in pagemap:
text = extract_pagemap_text(v, text, True)
else:
text = extract_pagemap_text(v, text, youtube)
return text
elif type(pagemap) == str or type(pagemap) == unicode:
    if not re.match(r'https?://.+', pagemap):
        text += ' ' + pagemap
    return text
# Any other value type (numbers, booleans, None) contributes no text
return text
def convert_pagemap_dict(item):
"""
Converts an item's pagemap to just its text
"""
d = {}
d.update(item)
if 'pagemap' in d:
d['pagemap'] = extract_pagemap_text(d['pagemap'])
else:
d['pagemap'] = ''
return d
lock = Lock()
def runit(l, uid):
#print '---------START----------------'
#print l.keys()
#print 'FACEBOOK: {}'.format(l['Facebook URL'])
#print l['name']
try:
non_webpage_list, search_success_vector, webpage, sl, st, items, sc, cs, ct, cc, child_links, child_text = getlinks(
l['Candidate Name'].decode('utf-8').strip(),
l['Facebook URL'].decode('utf-8').strip(),
l['State'].decode('utf-8').strip(),
l['type'].decode('utf-8').strip(),
l['name'].decode('utf-8').strip()
)
print 'UID:{}\nNON_WEBPAGE:{}\nSL:{}\nST:{}\nITEMS:{}\nSC:{}\nCS:{}\nCT:{}\nCC:{}\nCHILD LINKS:{}\nCHILD TEXT:{}\n\n'.format(uid,len(non_webpage_list[0]),len(sl[0]),len(st[0]),len(items[0]),len(sc[0]),len(cs[0]),len(ct[0]),len(cc[0]),len(child_links),len(child_text))
# print uid, len(non_webpage_list[0]), len(sl[0]), len(st[0]), \
# len(items[0]), len(sc[0]), len(cs[0]), len(ct[0]), len(cc[0]), \
# len(child_links), len(child_text)
except Exception as error:
import traceback
print traceback.format_exc()
print error
#print '-------ENDEXCEP------------------'
# Return 12 empty placeholders so callb can still unpack the
# 13-element result tuple
return uid, [], [], [], [], [], [], [], [], [], [], [], []
#print '---------END----------------'
return (uid, non_webpage_list, search_success_vector, webpage,
sl, st, items, sc, cs, ct, cc, child_links, child_text)
if __name__ == '__main__':
if '--full' in sys.argv:
full = 'full'
else:
full = ''
filename = sys.argv[1]
with open('fb/{filename}'.format(filename=filename), 'rU') as f,\
open('fb/non/{full}fbnonwebpages.csv'.format(full=full), 'a') as g,\
open('fb/non/{full}fbwebpage_ssv.csv'.format(full=full), 'a') as h,\
open('fb/{full}fbsearch_results.csv'.format(full=full), 'a') as k,\
open('fb/{full}fbsearch_results_combined.csv'.format(full=full), 'a') as m: # noqa
csvr = csv.DictReader(f)
csvw = csv.writer(g)
csvw2 = csv.writer(h)
csvw3 = csv.writer(k)
csvw4 = csv.writer(m)
#csvw.writerow(['uid', 'webpage', 'non_webpage_list'])
#csvw2.writerow(['uid', 'webpage', 'search_success_vector'])
#csvw3.writerow(['uid', 'link', 'class', 'sitetext', 'items'])
#csvw4.writerow(['uid', 'link', 'class', 'sitetext'])
search_rows_written = [0]
pool = Pool(processes=20)
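# Rows are dispatched to a pool of 20 worker processes; callb runs in the
# parent process as each result arrives, and the module-level lock
# serialises the CSV writes.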
def callb(results):
(uid, nwl, ssv, webpage, sl, st, items, sc, cs,
ct, cc, child_links, child_text) = results
lock.acquire()
csvw.writerow([uid, webpage, nwl])
csvw2.writerow([uid, webpage, ssv])
global csvw3
rotate = False
for i in range(len(sl)):
for j in range(len(sl[i])):
csvw3.writerow(
[uid, sl[i][j], sc[i][j], st[i][j], repr(items[i][j])]
)
search_rows_written[0] += 1
if search_rows_written[0] % 1000 == 0:
rotate = True
for i in range(len(child_links)):
csvw3.writerow(
[uid, child_links[i], 'Info', child_text[i], '']
)
if rotate:
# csvw3 = csv.writer(
# open('search_results{num}.csv'.format(
# num=search_rows_written[0]/1000),'w'))
pass
search_rows_written[0] = 0
for i in range(len(cs)):
for j in range(len(cs[i])):
csvw4.writerow([uid, cs[i][j], cc[i][j], ct[i][j]])
search_rows_written[0] += 1
if search_rows_written[0] % 1000 == 0:
rotate = True
if rotate:
# csvw3 = csv.writer(
# open('search_results{num}.csv'.format(
# num=search_rows_written[0]/1000),'w'))
pass
lock.release()
for l in csvr:
uid = l['UID']
pool.apply_async(runit, [l, uid], callback=callb)
pool.close()
pool.join()