-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy path002_Scrape_info_for_each_condo.py
156 lines (129 loc) · 6.8 KB
/
002_Scrape_info_for_each_condo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# -*- coding: utf-8 -*-
"""
*** The bs4 package is required for this script.
This script will:
1. Load all links from 'condo_links_all.txt' and store them as a list.
2. Extract the attributes for each link using the retrieve function.
3. The retrieve function checks whether historical price data is available;
if there is none, the link is skipped (see the graph check at the top of retrieve).
4. Sleep 5 seconds between requests (see time.sleep(5) in the main loop).
5. Save the output as 'df_completed.pkl' and 'df_completed.csv'.
"""
# import packages
from datetime import datetime
import time
import pandas as pd
import pickle as pk
from bs4 import BeautifulSoup
import requests
import os
# Work from the project folder so the relative file names used below resolve.
os.chdir(r"D:\GitHub_Personal\2019-01-Web-Scraping-using-selenium-and-bs4")

# Read the list of condo page URLs (one per line) into memory.
with open('condo_links_all.txt') as f:
    raw_links = f.read()
condo_links_all = raw_links.splitlines()

# Report how many links will be processed.
print(len(condo_links_all))
##############################################################################
# Function that retrieves the info for one condo page, using bs4.
# Working out how to extract each required field from the soup took some care.
def retrieve(link, timeout=30):
    """Scrape one condo page and return its attributes as a tuple.

    Parameters
    ----------
    link : str
        URL of the condo project page to download and parse.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled server cannot block
        the whole scrape forever. Defaults to 30 seconds.

    Returns
    -------
    tuple or None
        ``None`` when the page has no usable price chart (the ``graph1``
        element is missing or reports "Not enough data to build the graph").
        Otherwise a 21-item tuple, in this order:
        name, district, latitude, longitude, year_built, proj_area,
        nbr_buildings, nbr_floors, units, shops, schools, restaurants,
        hospital, amenities, transportation, price_sqm, change_last_q,
        change_last_y, rental_yield, change_last_y_rental_price, price_hist.
        ``shops``, ``schools`` and ``restaurants`` are lists of 5 strings,
        ``amenities`` is a list of 12 ints (1/0) and ``transportation`` is a
        list of 5 ``(type, name, distance)`` tuples.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or times out.
    AttributeError, IndexError, KeyError, TypeError
        If the page does not contain the elements this parser expects.
    """
    page = requests.get(link, timeout=timeout)
    print(link)
    soup = BeautifulSoup(page.content, 'html.parser')

    # No price chart on the page -> skip the listing by returning None.
    # A missing graph element is treated the same way as the explicit
    # "not enough data" message instead of failing with AttributeError.
    graph = soup.find(id="graph1")
    if graph is None or graph.get_text().strip() == "Not enough data to build the graph":
        print("---------Not enough data to build the graph----------", '\n')
        return None

    # Project header
    breadcrumb_spans = soup.find(class_="breadcrumb").findAll('span')
    name = breadcrumb_spans[2].get_text()
    district = breadcrumb_spans[1].get_text()
    # The coordinate is taken from the text after the first '=' in the tag's markup.
    latitude = str(soup.find(itemprop="latitude")).split("=")[1].split()[0]
    longitude = str(soup.find(itemprop="longitude")).split("=")[1].split()[0]
    description_tag = soup.find(class_="property-description__content")
    description = str(description_tag)
    year_built = soup.find(class_="project-header-year").find('span').get_text()
    proj_area = soup.find(class_="project-header-area").find('span').get_text().split()[0]
    nbr_buildings = soup.find(class_="project-header-tower").find('span').get_text()
    nbr_floors = soup.find(class_="project-header-floor").find('span').get_text()
    # The unit count is the last word before the first "units" in the description.
    units = description.split("units")[0].split()[-1]
    print(name, district, latitude, longitude, "\n", year_built, proj_area, nbr_buildings, nbr_floors, units)

    # Neighborhood: the first 15 bullet points are read as 5 shops, 5 schools
    # and 5 restaurants. Indexing (rather than slicing) is deliberate: a page
    # with fewer bullets raises IndexError instead of yielding misaligned data.
    bullet_items = description_tag.findAll('li')
    neighborhood = [bullet_items[i].get_text() for i in range(15)]
    shops = neighborhood[0:5]
    schools = neighborhood[5:10]
    restaurants = neighborhood[10:15]
    hospital = description_tag.findAll('p')[-3].get_text()

    # Amenities section
    # Elevator,Parking,Security,CCTV,Pool,Sauna,Gym,Garden,Playground,Shop,Restaurant,Wifi
    amenity_items = soup.find(class_="amenities").findAll('li')
    amenities = [1 if 'yes' in str(amenity_items[i]) else 0 for i in range(12)]

    # Location and Neighborhood: first 5 destinations as (type, name, distance).
    destinations = soup.findAll(class_="media neighborhood-destination")
    headings = soup.findAll(class_="media-heading")
    transportation = []
    for i in range(5):
        destination = destinations[i]
        tran_type = destination.find(class_="icon").i['class'][1]
        trans_name = headings[i].get_text()
        trans_dist = destination.find('small').get_text()
        transportation.append((tran_type, trans_name, trans_dist))

    # Market Stats: the five indicator boxes, in page order.
    indicators = soup.findAll(class_="indicator__amount")
    price_sqm = indicators[0].find(class_="money").get_text().strip('฿').replace(',', "")
    change_last_q = indicators[1].get_text().replace('\n', "").strip()
    change_last_y = indicators[2].get_text().replace('\n', "").strip()
    rental_yield = indicators[3].get_text().replace('\n', "").strip()
    change_last_y_rental_price = indicators[4].get_text().replace('\n', "").strip()

    # Price history graph: the series is on the 4th line of the inline script,
    # written as "data: ...," -- drop the label and the trailing comma.
    graph_container = soup.find(class_="row-fluid background-color-gray project__graph-container")
    script_lines = graph_container.find('script').get_text().split('\n')
    price_hist = script_lines[3].strip().strip(',').replace('data: ', "")

    return (name, district, latitude, longitude, year_built, proj_area, nbr_buildings, nbr_floors, units,
            shops, schools, restaurants, hospital, amenities, transportation,
            price_sqm, change_last_q, change_last_y, rental_yield, change_last_y_rental_price, price_hist)
##############################################################################
# Run the loop to retrieve data and store data as DataFrame, save as pickle.
# Scrape every link, checkpoint the collected rows periodically, then export.
start_time = datetime.now()
condo_list = []
for i, link in enumerate(condo_links_all):
    try:
        row = retrieve(link)
    except Exception as err:
        # Best-effort scrape: one bad page must not stop the run, but report
        # which link failed and why so it can be traced and retried later.
        print('retrieve failed @ i = ', i, link, repr(err))
    else:
        # retrieve() returns None for listings without price history; skip them.
        if row is not None:
            condo_list.append(row)
    print(i)
    time_elapsed = datetime.now() - start_time
    print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
    ### Give the 'sleep' time = 5 seconds. Space out each request so the server isn’t overwhelmed.
    time.sleep(5)
    # This is the preventive step: dump the data collected so far every
    # 5 links, so a crash loses at most the last few pages.
    processed = i + 1
    if processed % 5 == 0:
        df = pd.DataFrame(condo_list)
        with open('df.pkl', 'wb') as f:
            pk.dump(df, f)
        # Print the number of links processed and rows kept, so we can trace
        # back if an error occurs. The first number counts entries of
        # 'condo_links_all' handled so far.
        print('------------------------ dump @ i = ', processed, len(condo_list))
print("completed")

# Once complete, dump to pickle and save as 'df_completed.pkl'.
df_completed = pd.DataFrame(condo_list)
with open('df_completed.pkl', 'wb') as f:
    pk.dump(df_completed, f)

# export to csv
col_names = ['name', 'district', 'latitude', 'longitude', 'year_built', 'proj_area', 'nbr_buildings', 'nbr_floors', 'units',
             'shops', 'schools', 'restaurants', 'hospital', 'amenities', 'transportation',
             'price_sqm', 'change_last_q', 'change_last_y', 'rental_yield', 'change_last_y_rental_price', 'price_hist']
if condo_list:
    df_completed.to_csv("df_completed.csv", header=col_names, index=False, encoding='utf-8-sig')
else:
    # With no rows the frame has no columns, so passing 21 header aliases
    # would raise; write a header-only file instead so the reload below works.
    pd.DataFrame(columns=col_names).to_csv("df_completed.csv", index=False, encoding='utf-8-sig')

# load csv
df_dirty = pd.read_csv("df_completed.csv", sep=',', encoding='utf-8-sig')