-
Notifications
You must be signed in to change notification settings - Fork 3
/
reconcile_tag_data.py
106 lines (93 loc) · 3.28 KB
/
reconcile_tag_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from os import listdir
import shutil
from os.path import isfile, join
from collections import Counter
import json
import csv
import arrow
# Read the total number of jobs per date from result/jobs_count.csv.
# Column 0 carries the date inside a longer string: characters 4..13 are used
# as the key (presumably a 4-character prefix followed by YYYY-MM-DD -- confirm
# against whatever writes this file). Column 1 is the job count.
jobs_count = {}
# newline='' is what the csv module asks for when it is handed a file object.
with open(join('result', "jobs_count.csv"), newline='') as counts_f:
    jobs_reader = csv.reader(counts_f, delimiter=',')
    for row in jobs_reader:
        # csv.reader yields [] for a blank line; skip it instead of failing
        # with IndexError on row[1].
        if not row:
            continue
        date = row[0][4:14]
        jobs_count[date] = int(row[1])
# Collect the regular files found in result/tags, ordered by file name.
tags_dir = join('result', 'tags')
onlyfiles = sorted(
    entry for entry in listdir(tags_dir) if isfile(join(tags_dir, entry))
)
reconciled_tags = {}  # <tag, [date,count]>
processed_data = []

# Find the data from a year ago: the first file (in name order) whose date,
# taken from characters 9..18 of its name, is later than "now minus one year".
utc_now = arrow.utcnow()
year_ago = utc_now.shift(years=-1)
print(year_ago)
year_ago_filename = next(
    (name for name in onlyfiles if arrow.get(name[9:19]) > year_ago),
    None,
)
# Stays None when no file is newer than a year ago.
if year_ago_filename is not None:
    print(year_ago_filename)
# Build one summary entry per tag file and write all of them to
# result/data.json.
#
# For every file name in onlyfiles the JSON content is loaded and iterated as
# (tag, count) pairs. Each kept pair becomes a dict with the tag name, its
# count as a percentage of that date's total jobs (a string with two decimals)
# and a rank. The per-file entry is appended to processed_data.
#
# After the loop:
#   current_tags  - entry built from the last file in onlyfiles
#   year_ago_data - entry built from the file named year_ago_filename, or None
#                   if no file had that name
#
# NOTE(review): leading indentation is missing in this copy of the file, so it
# cannot be told from here whether `rank +=1` belongs to the `if v > 1` body
# or to the enclosing `for k,v in data` body. The two readings give different
# ranks whenever a pair with v <= 1 precedes a kept pair -- confirm against
# the original before re-indenting.
year_ago_data = None
current_tags = None
for fname in onlyfiles:
with open(join('result','tags',fname)) as f:
data = json.load(f)
# characters 9..18 of the file name are used as the date key
current_date = fname[9:19]
#TODO make num_jobs read from another file
# NOTE(review): num_jobs is already looked up in jobs_count (filled from
# result/jobs_count.csv), so the TODO above may be stale -- confirm.
# A date with no entry in jobs_count raises KeyError here.
current_tags = {"date": current_date, "num_jobs": jobs_count[current_date], "tags": []}
rank = 1
for k,v in data:
# we don't want insignificant terms, typos, etc.: pairs with a count of 1 or
# less are dropped
if v > 1:
current_tags["tags"].append({
"tag": k,
"perc": "{0:.2f}".format((v*1.0/current_tags["num_jobs"])*100),
"rank": rank
})
rank +=1
processed_data.append(current_tags)
# keep the data from a year ago for calculating top movers and losers
if fname == year_ago_filename:
year_ago_data = current_tags
# 'w+' truncates (or creates) the file; only writing is done here
with open(join('result', 'data.json'), 'w+') as f:
json.dump(processed_data, f)
# Calculate top movers and losers: compare every tag of the most recent
# snapshot (current_tags) with the snapshot from about a year ago
# (year_ago_data).
#
# Each entry appended to `changes` is a dict with:
#   tag         - the tag name
#   rank        - the tag's rank in the current snapshot
#   change      - relative change of its percentage versus a year ago, in
#                 percent and truncated to an int; 100 when there is no usable
#                 baseline
#   rank_change - old rank minus current rank (positive means the rank number
#                 got smaller), or None when there is no usable baseline
#
# A missing snapshot (None) is treated as a snapshot without tags.
current_entries = current_tags["tags"] if current_tags else []
previous_entries = year_ago_data["tags"] if year_ago_data else []

# Index last year's tags by name once instead of rescanning the whole list
# for every current tag; setdefault keeps the first occurrence of a name.
previous_by_tag = {}
for old_tag in previous_entries:
    previous_by_tag.setdefault(old_tag["tag"], old_tag)

changes = []
for tag in current_entries:
    # Defaults for a tag that has no usable baseline a year ago. rank_change
    # is reset for every tag so that it can neither be undefined nor carry
    # over the value computed for the previous tag.
    # TODO better for 100% calculation
    change = 100
    rank_change = None

    old_tag = previous_by_tag.get(tag["tag"])
    if old_tag is not None:
        old_perc = float(old_tag["perc"])
        # Compare numerically: the percentage strings are produced with
        # "{0:.2f}", so a zero reads "0.00" and a string comparison against
        # "0.0" would let it through to the division below.
        if old_perc != 0:
            change = ((float(tag["perc"]) - old_perc) / old_perc) * 100.0
            rank_change = old_tag["rank"] - tag["rank"]

    changes.append({
        "tag": tag["tag"],
        "rank": int(tag["rank"]),
        "change": int(change),
        "rank_change": rank_change,
    })
# Pick the five biggest winners and the five biggest losers (by percentage
# change) among the top 25 tags, the top 50 tags and all tags, then write the
# result to result/top_changes.json.
#
# topChanges maps "top25" / "top50" / "topALL" to
# {'best': [...], 'worst': [...]}: 'best' is ordered from the largest change
# downwards, 'worst' from the smallest change upwards. With fewer than ten
# candidates the two lists overlap.
sortedlist = sorted(changes, key=lambda k: k['rank'])

topChanges = {}
# The slice end is exclusive, so [:25] selects exactly 25 entries; a limit of
# None means "no cut-off".
for label, limit in (("top25", 25), ("top50", 50), ("topALL", None)):
    by_change = sorted(sortedlist[:limit], key=lambda k: k['change'])
    winners = by_change[-5:]
    winners.reverse()
    topChanges[label] = {'best': winners, 'worst': by_change[:5]}

with open(join('result', 'top_changes.json'), 'w') as f:
    json.dump(topChanges, f)