-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspurscommunity.py
144 lines (110 loc) · 3.86 KB
/
spurscommunity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
## Parser module
from bs4 import BeautifulSoup
import urllib, urllib2, cookielib, re
import stripper
import utilities
import objects
## Log in with session #####
def login(username, password):
    """Log in to spurscommunity.co.uk and return a cookie-carrying opener.

    Builds a urllib2 opener backed by a fresh cookie jar and submits the
    credentials to the forum's login URL, so that later requests made
    through the same opener carry whatever cookies the server set.

    The login response is not inspected: a rejected login is not detected
    here and the opener is returned either way.

    Args:
        username: Value sent as the ``login`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The opener that performed the login request.
    """
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    login_data = urllib.urlencode({'login': username, 'password': password})
    # Supplying a data argument turns the request into a POST.
    response = opener.open('http://spurscommunity.co.uk/index.php?login/login', login_data)
    # Only the cookies matter; release the connection instead of leaving the
    # unread response open until garbage collection.
    response.close()
    return opener
## Find link to current ITK thread
def get_current_itk_thread(opener):
    """Return the link of the first item in the transfer-rumours RSS feed.

    Args:
        opener: Object with an ``open(url)`` method returning a readable
            response (e.g. the opener returned by ``login``).

    Returns:
        The string content of the ``link`` element inside the first
        ``item`` element of the feed.
    """
    resp = opener.open('http://spurscommunity.co.uk/index.php?forums/transfer-rumours.46/index.rss')
    try:
        html = resp.read()
    finally:
        # Always release the connection, even if reading fails.
        resp.close()
    soup = BeautifulSoup(html)
    # soup.item is the first <item> in the feed.
    return soup.item.link.string
## Parse current page
def _is_rated_informative(post):
    """Return True if any rating icon in *post* has 'Informative' in its alt text."""
    for rating in post.findAll("ul", {"class": "dark_postrating_outputlist"}):
        for img in rating.findAll('img'):
            # .get() keeps an icon without an alt attribute from raising KeyError.
            if 'Informative' in (img.get('alt') or ''):
                return True
    return False


def parse_itk_thread(opener, link):
    """Walk every page of a thread, picking out posts rated informative.

    Starting at *link*, each page is downloaded and parsed, every
    ``li.message`` post is checked for an 'Informative' rating icon, and
    the "next page" link is followed until there is none. Pages are
    visited in a loop rather than by recursion so that very long threads
    cannot exhaust the interpreter's recursion limit.

    Args:
        opener: Object with an ``open(url)`` method returning a readable
            response (e.g. the opener returned by ``login``).
        link: URL of the first page to parse.

    Returns:
        None. Informative posts are prepared but not yet stored anywhere.
    """
    while True:
        resp = opener.open(link)
        try:
            html = resp.read()
        finally:
            resp.close()
        soup = BeautifulSoup(html)

        # Iterate over every post on the page.
        for post in soup.findAll("li", {"class": "message"}):
            if not _is_rated_informative(post):
                continue

            poster = post.find('a', {'class': 'username'})  # user data
            content = post.findAll('article')  # post content

            # TODO: saving is not implemented yet (and should be its own
            # function). These normalised fragments are the intended input
            # for get_username / get_user_url and for get_context_url /
            # get_context_content / get_context_poster / get_post_content.
            raw_user_data = utilities.fix_spurscommunity_url(str(poster))
            raw_content_data = utilities.fix_spurscommunity_url(str(content))

        # Follow the "Next" link, if any.
        next_page = get_next_page(soup)
        if not next_page:
            break
        link = 'http://www.spurscommunity.co.uk/' + next_page
def get_current_page(soup):
    """Return the href of the ``a.currentPage`` anchor in *soup*.

    Args:
        soup: Parsed page; must provide ``find(name, attrs)``.

    Returns:
        The anchor's ``href`` value, or None when the page has no anchor
        with class ``currentPage``.
    """
    current_page = soup.find('a', {'class': 'currentPage'})
    if current_page is None:
        # No such anchor on this page; report that rather than failing on
        # a None subscript.
        return None
    return current_page['href']
def get_next_page(soup):
    """Return the href of the 'Next' pagination link, or '' if there is none.

    Looks inside the ``div.PageNav`` element for ``a.text`` anchors and
    returns the href of the first one whose text contains 'Next'.

    Args:
        soup: Parsed page; must provide ``find(name, attrs)``.

    Returns:
        The href of the next-page anchor, or an empty string when the page
        has no pagination block or no 'Next' anchor.
    """
    page_nav = soup.find("div", {"class": "PageNav"})
    if page_nav is None:
        # No pagination block at all (presumably a single-page thread).
        return ''
    for anchor in page_nav.findAll('a', {'class': 'text'}):
        label = anchor.string
        # .string is None when the anchor does not wrap a single text node.
        if label and 'Next' in label:
            return anchor['href']
    return ''
def get_username(html):
    """Return the string content of the first ``<a>`` element in *html*.

    Args:
        html: Markup fragment expected to contain an anchor.

    Returns:
        The anchor's ``.string`` value.
    """
    first_anchor = BeautifulSoup(html).a
    return first_anchor.string
def get_user_url(html):
    """Return the ``href`` of the first ``<a>`` element in *html*.

    Args:
        html: Markup fragment expected to contain an anchor.

    Returns:
        The anchor's ``href`` attribute, or None if the anchor has no
        ``href``.
    """
    first_anchor = BeautifulSoup(html).a
    return first_anchor.get('href')
def get_context_poster(html):
    """Return the name of the user being quoted in *html*, if any.

    The name is taken from the ``div`` with class ``attribution type``:
    whatever sits between a ``>`` and the text `` said:`` in that
    element's markup.

    Args:
        html: Markup of a post's content.

    Returns:
        The quoted user's name, or None when there is no attribution
        element, the ``... said:`` pattern does not match, or the captured
        name is empty.
    """
    soup = BeautifulSoup(html)
    attribution = soup.find("div", {"class": "attribution type"})
    if attribution is None:
        return None
    # Check the match object itself before reading the group; re.search
    # returns None when the pattern is absent.
    match = re.search('>(.*) said:', str(attribution))
    if match is None:
        return None
    return match.group(1) or None
def get_context_url(html):
    """Return the ``href`` of the ``a.AttributionLink`` anchor in *html*.

    Args:
        html: Markup of a post's content.

    Returns:
        The anchor's ``href``, or None when no such anchor is present.
    """
    attribution_link = BeautifulSoup(html).find('a', {'class': 'AttributionLink'})
    return attribution_link['href'] if attribution_link else None
def get_context_content(html):
    """Return the string content of the first ``<blockquote>`` in *html*.

    Args:
        html: Markup of a post's content.

    Returns:
        The blockquote's ``.string`` value (which is itself None when the
        blockquote is not a single piece of text), or None when *html*
        contains no blockquote.
    """
    soup = BeautifulSoup(html)
    quote = soup.blockquote
    if quote is None:
        # Post quotes nothing; mirror get_context_url and report None.
        return None
    return quote.string
def get_post_content(html):
    """Return *html* re-serialised with quoted material blanked out.

    Every ``blockquote.quoteContainer`` element and every ``div`` with
    class ``attribution type`` is replaced by an empty string, leaving the
    poster's own content.

    Args:
        html: Markup of a post's content.

    Returns:
        The remaining document as a string.
    """
    document = BeautifulSoup(html)
    # Quote bodies first, then the quoted user's attribution line.
    unwanted = (
        ("blockquote", "quoteContainer"),
        ("div", "attribution type"),
    )
    for element_name, css_class in unwanted:
        for node in document.find_all(element_name, {"class": css_class}):
            node.replaceWith('')
    return str(document)
def get_post_ratings(html):
    """Collect the ratings listed in *html* as a ``{name: count}`` dict.

    Each ``<li>`` that contains both an ``<img>`` with an ``alt`` attribute
    and a ``<strong>`` element contributes one entry: the image's alt text
    mapped to the ``<strong>`` element's string content. List items missing
    any of those pieces are skipped.

    Args:
        html: Markup of a post's rating list.

    Returns:
        A dict of ratings. It always contains the placeholder entry
        ``'': ''`` in addition to the ratings found.
    """
    soup = BeautifulSoup(html)
    # NOTE(review): separator written to stdout on every call; looks like
    # leftover debug output - confirm before removing.
    print('-----------------------------')
    # NOTE(review): the '' -> '' seed entry looks like a placeholder; kept so
    # the returned dict has the same contents callers see today.
    ratings = {'': ''}
    for tag in soup.find_all("li"):
        icon = tag.img
        count = tag.strong
        # Skip incomplete items explicitly instead of swallowing every
        # exception, so genuine bugs are not hidden.
        if icon is None or count is None:
            continue
        name = icon.get('alt')
        if name is None:
            continue
        ratings[name] = count.string
    return ratings