-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchirpScraper.py
executable file
·100 lines (93 loc) · 3.49 KB
/
chirpScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#! /usr/bin/env python3
# encoding: utf-8
import time
import random
import requests
from bs4 import BeautifulSoup
class Scrape(object):
    '''
    Scrape the CHIRP Radio website for a DJ's playlist.

    Parameters
    ----------
    dj_uri : str
        URI of the DJ's playlist page.
    most_recent : bool
        When True, stop collecting at the first row whose date heading
        differs from the date of the most recent show on the first page.
    '''

    class ScrapeError(Exception):
        '''Raised when a playlist page cannot be fetched or parsed.'''
        pass

    def __init__(self, dj_uri, most_recent=True):
        self.dj_uri = dj_uri
        self.most_recent = most_recent

    @staticmethod
    def _strip_local(artist):
        '''
        Remove a literal trailing 'Local' badge from an artist name.

        BUGFIX: the original used str.rstrip('Local'), which strips any
        trailing run of the characters {L, o, c, a, l} — mangling names
        such as 'Tool Local' -> 'T'.  This removes only the exact suffix
        and trims the whitespace left behind.
        '''
        if artist.endswith('Local'):
            artist = artist[:-len('Local')].rstrip()
        return artist

    def get_tracks(self):
        '''
        Return a list of track dicts with keys 'artist', 'song', 'album'.

        On a scrape failure the error is printed and the int 1 is
        returned, preserving the original error-code contract.
        '''
        try:
            return self.make_soup()
        except self.ScrapeError as e:
            print(e.args)
            return 1

    def get_dj_html(self, uri):
        '''
        Fetch and return the HTML body for *uri*.

        Raises
        ------
        ScrapeError
            On a non-200 status or any network failure.

        BUGFIX: the original caught its own ScrapeError here and returned
        the int 1, which make_soup then fed to BeautifulSoup, crashing
        with an uncaught TypeError.  Letting ScrapeError propagate lets
        the existing handlers in make_soup/get_tracks honour the
        print-and-return-1 contract.
        '''
        try:
            # timeout so a dead server cannot hang the scraper forever
            r = requests.get(uri, timeout=30)
        except requests.RequestException as e:
            raise self.ScrapeError('Request failed: {}'.format(e))
        if r.status_code != 200:
            raise self.ScrapeError('HTTP status code: {}'.format(r.status_code))
        return r.text

    def _get_seed_date(self, uri):
        '''Fetch the first page and return the date (text before the
        first '-') of the most recent show's date heading.'''
        html = self.get_dj_html(uri)
        soup = BeautifulSoup(html, 'html.parser')
        first_row = soup.find('table').find_all('tr')[0]
        raw = first_row.find('td', attrs={'class': 'date-heading'}).text.strip()
        return raw.split('-')[0].strip()

    def make_soup(self):
        '''
        Walk the DJ's playlist pages and collect the tracks.

        A seed date is read from the first page; when most_recent is
        True, collection stops at the first row dated differently from
        that seed.  Pagination follows the 'next' link until none
        remains.  Returns the track list, or prints the error and
        returns 1 on ScrapeError (original contract).
        '''
        try:
            tracks = []
            uri = self.dj_uri
            seed_date = self._get_seed_date(uri)
            while uri:
                html = self.get_dj_html(uri)
                soup = BeautifulSoup(html, 'html.parser')
                rows = soup.find('table').find_all('tr')
                date_match = True
                for row in rows:
                    artist = row.find('td', attrs={'class': 'artist'})
                    song = row.find('td', attrs={'class': 'track'})
                    album = row.find('td', attrs={'class': 'album'})
                    if artist and song and album:
                        name = artist.text.strip()
                        # the 'Local' badge is rendered inside the artist cell
                        if '<mark>Local</mark>' in str(artist):
                            name = self._strip_local(name)
                        tracks.append({'artist': name,
                                       'song': song.text.strip(),
                                       'album': album.text.strip()})
                    if self.most_recent:
                        found_raw = row.find('td', attrs={'class': 'date-heading'})
                        if found_raw:
                            found_date = found_raw.text.strip().split('-')[0].strip()
                            if seed_date != found_date:
                                date_match = False
                                break
                if not date_match:
                    uri = None
                else:
                    pages = soup.find('ol', attrs={'class': 'pagination'})
                    try:
                        uri = pages.find('a', attrs={'class': 'next'})['href']
                    except (AttributeError, TypeError, KeyError):
                        # no pagination block, no 'next' link, or no href
                        uri = None
                    # polite jitter between page fetches
                    time.sleep(random.random())
            return tracks
        except self.ScrapeError as e:
            print(e.args)
            return 1