#!/usr/bin/env python
# Dependencies
from lxml import html
# URL/HTTP management library
import requests
# Data management library
import pandas as pd
# Time management library
import datetime as dt
# Numerical library
import numpy as np
# Regular expressions (square footage and lat/long extraction)
import re
# Open and OCR the floorplan image
import io
import pytesseract
from PIL import Image
class rightmove_data(object):
    """The rightmove_data web scraper works by creating an instance of the class
    on the URL returned by a search on the rightmove website. Go to rightmove.co.uk
    and search for whatever you want, then create an instance of the class on the
    URL returned by the search. The resulting object exposes various methods for
    extracting data from the search results, the most useful being .get_results(),
    which returns all results as a pandas DataFrame.
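
    Example (the search URL below is hypothetical; use the URL returned by
    your own search):

        scraper = rightmove_data("https://www.rightmove.co.uk/property-for-sale/find.html?searchType=SALE&locationIdentifier=REGION%5E87490")
        results = scraper.get_results()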
"""
    def __init__(self, url):
        # `self` represents the instance of the object itself and is declared
        # explicitly; __init__ is the constructor in Python. Calling
        # rightmove_data(url) creates an object and passes it as the first
        # parameter to __init__, with any additional parameters (url) passed
        # as arguments after it.
        self.url = url
        # Determine whether the URL is a property search result; works for rent and sale.
        if "searchType=SALE" in self.url or "property-for-sale" in self.url:
            self.rent_or_sale = "SALE"
        elif "searchType=RENT" in self.url or "property-to-rent" in self.url:
            self.rent_or_sale = "RENT"
        else:
            raise ValueError("Not a valid rightmove search URL.")
# the number of search results
self.results_count = self.__results_count()
# the number of pages to get all the results
self.result_pages_count = self.__result_pages_count()
    def __results_count(self):
        """Returns an integer of the total number of results returned by the search URL."""
        # Request the search URL.
        page = requests.get(self.url)
        # Parse the HTTP response content with the lxml html module.
        tree = html.fromstring(page.content)
        # XPath for the result count displayed on the page.
        xp_result_count = """//span[@class="searchHeader-resultCount"]/text()"""
        # Query the tree for the result count and strip thousands separators.
        return int(tree.xpath(xp_result_count)[0].replace(",", ""))
def __result_pages_count(self):
"""Returns the number of result pages returned by the search URL.
There are 24 results on each results page, but note that the
rightmove website limits results pages to a maximum of 42 pages."""
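        # Worked example: 100 results -> 100 // 24 = 4 full pages, plus one
        # more page for the remaining 4 results, giving 5 pages in total.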
page_count = self.results_count // 24
if self.results_count % 24 > 0:
page_count += 1
# Rightmove will return a maximum of 42 results pages, hence:
if page_count > 42: page_count = 42
return page_count
    def __get_individual_info(self, property_url):
        """This is a hidden method to scrape the data from an individual property
        page. It is used iteratively by the .get_results() and .__get_page_results()
        methods.
        """
# Set the xpaths for the description
xp_keyfeatures = """//div[@class="sect key-features"]\
//ul[@class="list-two-col list-style-square"]//li/text()"""
xp_desc = """//div[@class="sect "]\
//p[@itemprop="description"]/text()"""
xp_firstlisted = """//div[@id="firstListedDateValue"]/text()"""
# Set the xpaths for the geo location of the property
xp_latlong = """//div[@class="pos-rel"]\
//a[@class="block js-tab-trigger js-ga-minimap"]\
//img/@src"""
# set the path for the floorplan
xp_floorplan = """//div[@class="zoomableimagewrapper"]\
//img/@src"""
# Use the requests library to get the whole web page.
page = requests.get(property_url)
# Process the html.
tree = html.fromstring(page.content)
# Create data lists from Xpaths.
        # Date the property was first listed (may be absent from the page).
        fd = tree.xpath(xp_firstlisted)
        firstd = fd[0] if len(fd) != 0 else None
        firstlisted = pd.to_datetime(firstd, dayfirst=True, format='%d %B %Y', errors='ignore')
        # Any text describing the property.
        list_key_features = tree.xpath(xp_keyfeatures)
        keyfeatures = ' '.join(list_key_features)
        keyfeatures = keyfeatures.replace('/', ' ')
        desc = tree.xpath(xp_desc)
        description = ' '.join(desc)
        # The full text available for mining.
        description = description + ' ' + keyfeatures
        # The unique words in the description.
        descWords = list(set(description.replace('/', ' ').replace('.', '')
                             .replace('\'', '').replace(',', '').lower().split()))
        # Try to find the square footage, e.g. "1,024 sq ft" or "650 sq. ft".
        regex = r'(?<= )(\d\d*[,\.]?\d+)+(?=[ .]?sq)'
        listsqftage = re.findall(regex, description.lower())
        f = [float(i.replace(',', '')) for i in listsqftage]
        sqftage = float(max(f)) if len(f) != 0 else None
        # Look for square footage in the floorplan image via OCR.
        floorplan_urls = tree.xpath(xp_floorplan)
        if len(floorplan_urls) != 0:
            floorplan_url = floorplan_urls[0]
            # Download the floorplan; response is a requests.models.Response.
            response = requests.get(floorplan_url)
            # Open the raw bytes as a PIL image.
            flrpln = Image.open(io.BytesIO(response.content))
            # Run OCR over the floorplan image.
            text = pytesseract.image_to_string(flrpln)
            print(text)
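            # Sketch of a possible next step (not wired into the return
            # value): reuse the square-footage regex on the OCR output, e.g.
            #   ocr_sqft = re.findall(regex, text.lower())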
        else:
            floorplan_url = None
        # Latitude and longitude (parsed from the minimap image URL).
        latlongs = tree.xpath(xp_latlong)
        latlong = ' '.join(latlongs)
        regexlat = r'(?<=latitude=)(-?\d\d*[\.]?\d+)'
        regexlon = r'(?<=longitude=)(-?\d\d*[\.]?\d+)'
        lat = re.findall(regexlat, latlong)
        latitude = lat[0] if len(lat) != 0 else None
        lon = re.findall(regexlon, latlong)
        longitude = lon[0] if len(lon) != 0 else None
return [latitude , longitude , descWords , sqftage , firstlisted]
def __get_page_results(self,page_url):
"""This is a hidden method to scrape the data from a single page
of search results. It is used iteratively by the .get_results()
method to scrape data from every page returned by the search."""
# Set the correct xpath for the price.
if self.rent_or_sale == "RENT":
xp_prices = """//span[@class="propertyCard-priceValue"]/text()"""
elif self.rent_or_sale == "SALE":
xp_prices = """//div[@class="propertyCard-priceValue"]/text()"""
# Set the xpaths for listing title, property address,
# listing URL, and agent URL.
xp_titles = """//div[@class="propertyCard-details"]\
//a[@class="propertyCard-link"]\
//h2[@class="propertyCard-title"]/text()"""
xp_addresses = """//address[@class="propertyCard-address"]//span/text()"""
xp_layus = """//div[@class="propertyCard-description"]\
//a[@class="propertyCard-link"]//span/text()"""
xp_addedon = """//div[@class="propertyCard-detailsFooter"]\
//div[@class="propertyCard-branchSummary"]\
//span[@class="propertyCard-branchSummary-addedOrReduced"]/text()"""
xp_weblinks = """//div[@class="propertyCard-details"]\
//a[@class="propertyCard-link"]/@href"""
# Use the requests library to get the whole web page.
page = requests.get(page_url)
# Process the html.
tree = html.fromstring(page.content)
# Create data lists from Xpaths.
price_pcm = tree.xpath(xp_prices)
titles = tree.xpath(xp_titles)
addresses = tree.xpath(xp_addresses)
laius = tree.xpath(xp_layus)
addedon = tree.xpath(xp_addedon)
urlbase = "http://www.rightmove.co.uk"
        weblinks = ["{}{}".format(urlbase, link) for link in tree.xpath(xp_weblinks)]
        ######### Obtain the supplementary info from the individual property pages.
        latitude = []
        longitude = []
        descwords = []
        sqftage = []
        firstlisted = []
        for weblink in weblinks:
            if weblink != urlbase:
                info = self.__get_individual_info(weblink)
            else:
                # Keep the lists aligned with weblinks when a link is empty.
                info = [None, None, None, None, None]
            latitude.append(info[0])
            longitude.append(info[1])
            descwords.append(info[2])
            sqftage.append(info[3])
            firstlisted.append(info[4])
        # Store the data in a temporary pandas DataFrame.
        data = [price_pcm, titles, addresses, laius, addedon, weblinks,
                latitude, longitude, descwords, sqftage, firstlisted]
        temp_df = pd.DataFrame(data)
        temp_df = temp_df.transpose()
        temp_df.columns = ["price", "type", "address", "short desc", "listing date",
                           "url", "latitude", "longitude", "descwords", "sqftage",
                           "firstlisted"]
# Drop empty rows which come from placeholders in the html.
temp_df = temp_df[temp_df["address"].notnull()]
return temp_df
def get_results(self):
"""Returns a pandas DataFrame with all results returned by the search."""
        # Create DataFrame to store results; use a list so column order is stable.
        full_results = pd.DataFrame(columns=["price", "type", "address", "short desc",
                                             "listing date", "url", "latitude",
                                             "longitude", "descwords", "sqftage",
                                             "firstlisted"])
        # Iterate through the pages of results, using .__get_page_results() to scrape each.
        for page in range(self.result_pages_count):
            # Create the URL of the specific results page.
            iteration_url = "{}{}{}".format(str(self.url), "&index=", str(page * 24))
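            # e.g. the third page (page == 2) appends "&index=48" to the search URL.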
# Create a temporary dataframe of the page results.
temp_df = self.__get_page_results(iteration_url)
# Concatenate the temporary dataframe with the full dataframe.
frames = [full_results, temp_df]
full_results = pd.concat(frames)
# Reset the index.
full_results = full_results.reset_index(drop=True)
# Convert price column to numeric type.
        full_results["price"] = full_results["price"].replace(regex=True, to_replace=r"\D", value="")
        full_results["price"] = pd.to_numeric(full_results["price"])
# Extract postcodes to a separate column.
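        # The pattern captures the UK outward code only, e.g. "W1K" from
        # "Mayfair, London W1K".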
full_results["postcode"] = full_results["address"].str.extract\
(r"\b([A-Za-z][A-Za-z]?[0-9][0-9]?[A-Za-z]?)\b", expand=True)
        # Extract creation/modification date to a separate column.
        date_cond = [full_results["listing date"].str.contains("today"),
                     full_results["listing date"].str.contains("yesterday"),
                     (~full_results["listing date"].str.contains("today")
                      & ~full_results["listing date"].str.contains("yesterday"))]
        date_choices = [dt.date.today().strftime('%d/%m/%Y'),
                        (dt.date.today() - dt.timedelta(1)).strftime('%d/%m/%Y'),
                        pd.to_datetime(full_results["listing date"].str[-10:],
                                       dayfirst=True, format='%d/%m/%Y', errors='ignore')]
        full_results['date'] = np.select(date_cond, date_choices)
        # The leading text (e.g. "Added on" or "Reduced on") describes the date type.
        full_results["date_type"] = full_results["listing date"].str[:-10]
        full_results["date_type"] = full_results["date_type"].str.strip()
        # Extract the rightmove property ID (the slice assumes the standard
        # listing URL format).
        full_results["RM_ID"] = full_results["url"].str[54:-5]
        ####### Below we extract underlying information from "type".
        # Extract number of bedrooms from "type" to a separate column.
        full_results["number_bedrooms"] = full_results.type.str.extract(r"\b([\d][\d]?)\b", expand=True)
        full_results.loc[full_results["type"].str.contains("studio", case=False), "number_bedrooms"] = 0
        full_results["number_bedrooms"] = full_results["number_bedrooms"].astype(float)
        # Flag property categories: house, flat, detached, semi-detached,
        # terraced, penthouse, duplex/triplex, land, off-plan.
full_results['flat'] = full_results.type.str.contains('apartment|flat|plex|maisonette|penthouse')
full_results['house'] = ~full_results.type.str.contains('apartment|flat|plex|maisonette|penthouse|land|plot')
full_results['detached'] = full_results.type.str.contains("detached") & ~full_results.type.str.contains("semi")
full_results['semi-d'] = full_results.type.str.contains('semi')
full_results['penthouse'] = full_results.type.str.contains('penthouse')
full_results['duplex'] = full_results.type.str.contains('plex')
full_results['land'] = full_results.type.str.contains('land|plot')
full_results['offplan'] = full_results.type.str.contains('off-plan')
# Clean up annoying white spaces and newlines in "type" column.
for row in range(len(full_results)):
type_str = full_results.loc[row, "type"]
clean_str = type_str.strip("\n").strip()
full_results.loc[row, "type"] = clean_str
# Add column with datetime when the search was run (i.e. now).
now = dt.datetime.today()
full_results["search_date"] = now
# Remove superfluous columns and data
        full_results = full_results.drop(['listing date'], axis=1)
return full_results
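

# Minimal usage sketch, assuming a hypothetical search URL (replace it with
# the URL returned by your own search on rightmove.co.uk).
if __name__ == "__main__":
    search_url = ("https://www.rightmove.co.uk/property-for-sale/find.html"
                  "?searchType=SALE&locationIdentifier=REGION%5E87490")
    scraper = rightmove_data(search_url)
    results = scraper.get_results()
    print(results.head())
    print("Scraped {} listings.".format(len(results)))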