-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathfeat_Uuid_Doc_Srce_id.py
66 lines (58 loc) · 2.19 KB
/
feat_Uuid_Doc_Srce_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
"""
This script will find all documents clicked historically by a user.
However for extra granularity we store spearately whether the traffic source of the doucment
was 'internal', 'social' or search. For each, it is stored in a separate FFM namespace.
We only store documents which occured at least 100 times in the events file.
The script shuld be executed with pypy for faster execution.
Code leveraged from rcarson @ https://www.kaggle.com/jiweiliu/outbrain-click-prediction/extract-leak-in-30-mins-with-small-memory
@author: darragh
"""
import csv, os, gzip
input_dir = os.getenv('INPUT', '../input')
# define distionaries
uuid_uid = {} # Map of uuids to numeric ids
uuid_ev = {} # a check for if the user exists in events
ctdoc = {} # check if the document occured at least 100 times.
for c, row in enumerate(csv.DictReader(gzip.open('cache/events.csv.gz'))):
if row['uuid'] != '':
uuid_ev[row['uuid']] = 1
uuid_uid[row['uuid']] = row['uid']
if row['document_id'] not in ctdoc:
ctdoc[row['document_id']] = 1
else:
ctdoc[row['document_id']] += 1
print('all docs : ' + str(len(ctdoc)))
ctdoc = { key:value for key, value in ctdoc.items() if value > 100 }
print('common docs > 100 : ' + str(len(ctdoc)))
count = 0
outfile = "cache/viewed_doc_trf_source.csv.gz"
filename = input_dir + '/page_views.csv.gz'
# filename = input_dir + '/page_views_sample.csv.gz' # comment this out locally
for c, row in enumerate(csv.DictReader(gzip.open(filename))):
if c % 1000000 == 0:
print (c, count)
if row['document_id'] not in ctdoc:
continue
if row['uuid'] not in uuid_ev:
continue
if uuid_ev[row['uuid']] == 1:
uuid_ev[row['uuid']] = set()
lu = len(uuid_ev[row['uuid']])
uuid_ev[row['uuid']].add(row['document_id'])
if lu != len(uuid_ev[row['uuid']]):
count += 1
# Delete output file if it already exists
try:
os.remove(outfile)
except OSError:
pass
# Open the file to write to
fo = gzip.open(outfile, 'w')
fo.write('uuid,doc_trf_ids\n')
for i in uuid_ev:
if uuid_ev[i] != 1:
tmp = list(uuid_ev[i])
fo.write('%s,%s\n' % (uuid_uid[i], ' '.join(tmp)))
del tmp
fo.close()