-
Notifications
You must be signed in to change notification settings - Fork 2
/
align_artists.py
73 lines (60 loc) · 2.45 KB
/
align_artists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
# encoding: utf-8
"""
align_artists.py
Created by Benjamin Fields on 2011-10-27.
Copyright (c) 2011.
"""
import sys
import getopt
import simplejson
import urllib2
from time import sleep
from apikey import API_KEY
# Usage text printed by main() when the script is called with the wrong
# number of arguments. The separator named here must match the literal
# '<SEP>' token that main() splits input lines on and writes to the output.
help_message = ''' <input_file> <output_file>
Basic alignment of the artists in the million song dataset against Musicmetric tracked artists.
input_file should be a list of identifiers from the MSD of the sort available at:
http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_artists.txt
and described at
http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/unique_artists.txt
the output file will be of the same form (using the separator '<SEP>') with an additional column on the end
containing semetric artist UUIDs. Note that output_file will only contain the entries for the ids that
matched the semetric API, those not found will be skipped.
'''

# URL template for looking an artist up by MusicBrainz id; the {mbzid} and
# {key} placeholders are filled in with str.format() by main().
ID_LOOKUP_URL = "http://apib2.semetric.com/artist/musicbrainz:{mbzid}?token={key}"
class Usage(Exception):
    """Raised for a command-line usage problem.

    The text to show the user is kept on the ``msg`` attribute.
    """

    def __init__(self, msg):
        # Only the message is stored; callers read it back via ``err.msg``.
        self.msg = msg
def main(argv=None):
if argv is None:
argv = sys.argv
try:
if len(argv) != 3:
raise Usage(help_message)
infile = argv[1]
outfile = argv[2]
except Usage, err:
print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
print >> sys.stderr, "\t for help use --help"
return 2
with open(outfile, 'w') as wh:
with open(infile) as fh:
for enartistid, mbzid, entrackid, name in (line.strip().split('<SEP>') for line in fh.readlines()):
sleep(0.5)#be polite.
try:
page = urllib2.urlopen(ID_LOOKUP_URL.format(mbzid=mbzid,key=API_KEY)).read()
resp_env = simplejson.loads(page)
if resp_env['success']:
semetric_id = resp_env['response']['id']
wh.write("{0}<SEP>{1}<SEP>{2}<SEP>{3}<SEP>{4}\n".format(enartistid,
mbzid,
entrackid,
name,
semetric_id))
print name,"has the mbzid:", mbzid, "found semetric id:", semetric_id
else:
print "couldn't find an id for", name, "mbzid:", mbzid
except Exception, err:
print "unable to process", name, "with mbzid", mbzid,"due to err:", err
# Script entry point: run main() and use its return value as the process
# exit status (None means success, 2 means a usage error).
if __name__ == "__main__":
    raise SystemExit(main())