-
Notifications
You must be signed in to change notification settings - Fork 0
/
main1_export_Brandon.rmd
245 lines (198 loc) · 8.67 KB
/
main1_export_Brandon.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Setup
```{r, echo=F, message=F}
library(tidygeocoder)
library(tidyverse)
library(broom)
library(dplyr)
library(rafalib)
library(plotly)
library(rvest)
library(lubridate)
library(geosphere)
library(parallel)
```
# Eight basic functions:
```{r}
# House scraping: get_df_suburb
#' Scrape sold-townhouse records for one suburb from auhouseprices.com.
#'
#' Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
#'
#' @param location Postcode/suburb URL fragment with a trailing slash, e.g.
#'   "2151/Parramatta/". Runs of whitespace are replaced by "+".
#' @return A data frame with columns House_ID, address, bedroom, bathroom,
#'   carspace, soldprice and yearsold (a Date), one row per listing that has
#'   a recorded price. Returns NULL when any step fails; in that case the
#'   location is appended to
#'   main1_export_Brandon_log/main1_export_Brandon_log.txt and the error is
#'   reported with message().
get_df_suburb <- function(location = "2151/Parramatta/") {
  base_url <- "https://www.auhouseprices.com/sold/list/NSW/"
  # type set to townhouse, no other filtering
  query <- "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw="

  tryCatch({
    location <- gsub("\\s+", "+", location)
    print(location)

    # The first <h2> of page 1 carries the counts used to work out how many
    # result pages have to be walked.
    url <- paste0(base_url, location, 1, query)
    print(url)
    webpage <- read_html(url)
    find_page_number <- webpage %>% html_nodes("h2") %>% html_text()
    find_page_number <- find_page_number[1]
    numbers <- as.numeric(
      regmatches(find_page_number, gregexpr("[0-9]+", find_page_number))[[1]]
    )
    # number of total properties / number on page = total number of pages
    end_page <- ceiling(numbers[3] / numbers[2])

    print(paste0(location, ": begins 0/4"))
    print(paste0("Current suburb: ", location))
    print(paste0("Total pages ", end_page))

    # Fail with a clear reason instead of an obscure `1:NA` error when the
    # heading could not be parsed.
    if (!isTRUE(is.finite(end_page) && end_page >= 1)) {
      stop("could not determine the number of result pages", call. = FALSE)
    }

    # Collect one data frame per page and bind once at the end, rather than
    # re-copying the accumulated result on every iteration.
    pages <- vector("list", end_page)
    for (this_page in seq_len(end_page)) {
      if (this_page %% 5 == 0) {
        print(paste0("Page processed: ", this_page, "/", end_page))
      }

      # get website text
      url <- paste0(base_url, location, this_page, query)
      webpage <- read_html(url)
      result <- webpage %>% html_nodes("li") %>% html_text()
      # end of the relevant content: keep everything up to the first
      # "current" entry
      result <- result[seq_len(grep("current", result)[1])]
      # remove the redundant "listed price"
      result <- result[!grepl("List", result)]
      # remove the price listed with rent
      result <- result[!grepl("Rent", result)]

      sold_info <- grep("Sold on", result)   # entries with sold info
      price_info <- grep("\\$", result)      # entries with price info

      # Price entries look like "<bed> <bath> <car> $<price>": split on "$"
      price_bedroom <- strsplit(result[price_info], "\\$")
      bedroom <- lapply(price_bedroom, `[`, 1)
      bedroom <- strsplit(unlist(trimws(bedroom)), "\\s+")
      price <- lapply(price_bedroom, `[`, 2)
      price <- trimws(price)
      price <- as.numeric(gsub(",", "", price))

      # The sold date sits in the entry immediately before each price entry;
      # listings without a price are therefore skipped.
      timesold <- result[price_info - 1]
      timesold <- trimws(gsub("Sold on", "", timesold))
      # Dates come as "day month year", "month year" or just "year"
      timesold <- lapply(timesold, function(x) {
        n_parts <- length(strsplit(x, "\\s")[[1]])
        if (n_parts == 3) {
          dmy(x)
        } else if (n_parts == 2) {
          my(x)
        } else {
          as.Date(paste0(x, "-01-01"))
        }
      })
      timesold <- do.call("c", timesold)

      # get address of these properties
      address <- webpage %>% html_nodes("h4") %>% html_text()
      # keep only the headings that come before the "Auction History" heading
      address <- address[seq_len(grep("Auction History", address)[1] - 1)]
      # For every sold entry the immediate next row should be its price; if
      # it is not, that sold entry has no price record and is dropped.
      contain_price <- sold_info %in% (price_info - 1)
      address <- address[contain_price]

      pages[[this_page]] <- data.frame(
        address = address,
        bedroom = as.numeric(unlist(lapply(bedroom, `[`, 1))),
        bathroom = as.numeric(unlist(lapply(bedroom, `[`, 2))),
        carspace = as.numeric(unlist(lapply(bedroom, `[`, 3))),
        soldprice = price,
        yearsold = timesold
      )
    }

    df <- do.call(rbind, pages)
    if (is.null(df) || nrow(df) == 0) {
      stop("no listings with a recorded price were found", call. = FALSE)
    }
    # Running row number as the first column
    df <- cbind(House_ID = seq_len(nrow(df)), df)

    print(paste0("Page processed: ", end_page, "/", end_page))
    print(paste0(location, ": 1/4: get_df_suburb: creating data frame done!"))
    df
  }, error = function(e) {
    # Surface the reason; the log file itself keeps listing only locations.
    message("get_df_suburb failed for ", location, ": ", conditionMessage(e))

    # Same directory/file that clear_log() removes
    file_path <- "main1_export_Brandon_log"
    file_name <- "main1_export_Brandon_log.txt"
    # Create the directory if it doesn't exist
    if (!dir.exists(file_path)) {
      dir.create(file_path)
    }
    # Write location to the file
    write(location, file.path(file_path, file_name), append = TRUE)
    NULL
  })
}
#' Haversine distance between a point and a fixed reference point.
#'
#' @param lat,lon Latitude and longitude of the point.
#' @param fixed_lat,fixed_lon Latitude and longitude of the reference point.
#' @return The value returned by distHaversine() for the two points
#'   (metres with geosphere's default radius, per its documentation).
add_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  # distHaversine() is given each point in (longitude, latitude) order
  point <- c(lon, lat)
  reference <- c(fixed_lon, fixed_lat)
  distHaversine(point, reference)
}
#' Geocode scraped addresses and add their distance to a reference point.
#'
#' @param df_suburb Data frame with an `address` column.
#' @param suburb_lat,suburb_lon Latitude and longitude of the reference
#'   point the distance is measured to.
#' @param location Suburb label, used only in the progress messages.
#' @return A data frame holding the geocoded input (with `latitude` and
#'   `longitude` columns) plus a `distance_to_train_station` column.
get_l_suburb_dist <- function(df_suburb, suburb_lat, suburb_lon, location) {
  # Look up coordinates for every address with the ArcGIS geocoder
  l_suburb <- geocode(
    df_suburb, address,
    method = 'arcgis', lat = latitude, long = longitude
  )
  print(paste0(location, ": 2/4: get_l_suburb: done!"))

  # Row by row: column 1 is latitude, column 2 is longitude
  coords <- l_suburb[, c("latitude", "longitude")]
  distances <- apply(coords, 1, function(row) {
    add_distance_between(row[1], row[2], suburb_lat, suburb_lon)
  })
  l_suburb_dist <- data.frame(l_suburb, distance_to_train_station = distances)
  print(paste0(location, ": 3/4: get_l_suburb_dist: done!"))

  return(l_suburb_dist)
}
#' Write one suburb's result table to a CSV file.
#'
#' The file is named "l_<location with / replaced by _>houseprice.csv",
#' e.g. "l_2151_Parramatta_houseprice.csv" for "2151/Parramatta/".
#'
#' @param location Suburb label such as "2151/Parramatta/".
#' @param l_suburb_dist Data frame to export.
#' @param out_dir Directory the CSV is written to; created when missing.
#'   Defaults to "csv_cache" under the working directory.
#' @return The string "Result: csv export finished".
export_l_suburb_dist_csv <- function(location, l_suburb_dist, out_dir = "csv_cache") {
  # Check and create the SAME directory the file is written to (previously
  # "~/csv_cache" was tested while "csv_cache" was created and used).
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  file_name <- paste0("l_", gsub("/", "_", location), "houseprice.csv")
  print(file_name)

  file_path <- file.path(out_dir, file_name)
  write.csv(l_suburb_dist, file_path, row.names = FALSE)

  print(paste0(location, ": 4/4: export_l_suburb_dist_csv: done!"))
  "Result: csv export finished"
}
#' Run the whole pipeline for one suburb: scrape, geocode, export.
#'
#' @param location Suburb label such as "2151/Parramatta/".
#' @param suburb_lat,suburb_lon Coordinates of the reference point passed
#'   on to get_l_suburb_dist().
#' @return NULL when get_df_suburb() returned NULL; otherwise the printed
#'   completion message (returned invisibly by print()).
export_a_suburb <- function(location, suburb_lat, suburb_lon) {
  scraped <- get_df_suburb(location)

  # get_df_suburb() signals failure with NULL; nothing to export then
  if (is.null(scraped)) return(NULL)

  with_distance <- get_l_suburb_dist(scraped, suburb_lat, suburb_lon, location)
  export_l_suburb_dist_csv(location, with_distance)

  done_msg <- paste0(location, ": Finish csv export")
  print(done_msg)
}
#' Export every suburb listed in an input file, using forked workers.
#'
#' The input file has no header and three comma-separated fields per line:
#' location, latitude, longitude. Lines whose location starts with "#" are
#' skipped.
#'
#' @param file_name Path to the input file.
#' @param num_cores Number of worker processes for mclapply(). Falls back
#'   to 1 when the value is missing/invalid or when running on Windows.
#' @return NULL.
export_all_suburbs_parallel <- function(file_name, num_cores = detectCores()) {
  # Read the input file
  suburbs_input <- read.table(
    file_name,
    header = FALSE, sep = ",",
    col.names = c("location", "latitude", "longitude"),
    strip.white = TRUE, comment.char = "", quote = ""
  )
  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]

  # detectCores() can return NA, and mclapply() needs a positive count
  if (length(num_cores) != 1 || is.na(num_cores) || num_cores < 1) {
    num_cores <- 1L
  }
  # mclapply() relies on forking, which Windows does not provide
  if (.Platform$OS.type == "windows") {
    num_cores <- 1L
  }

  # Function to process a single suburb (used by mclapply)
  process_suburb <- function(i) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])
    export_a_suburb(location, latitude, longitude)
  }

  # seq_len() yields nothing for an empty table, whereas 1:0 would visit the
  # non-existent rows 1 and 0.
  mclapply(seq_len(nrow(suburbs_input)), process_suburb, mc.cores = num_cores)
  NULL
}
#' Delete the failed-location log, if there is one.
#'
#' @return TRUE when the log file was removed, FALSE when it could not be
#'   removed or did not exist.
clear_log <- function() {
  # Set the file path and name
  log_file <- file.path("main1_export_Brandon_log", "main1_export_Brandon_log.txt")

  # A missing log (e.g. on the first run) is not an error: skip the removal
  # so file.remove() does not emit a warning.
  if (!file.exists(log_file)) {
    return(FALSE)
  }

  # Remove the file
  file.remove(log_file)
}
#' Export every suburb listed in an input file, one after another.
#'
#' The failed-location log is cleared first. The input file has no header
#' and three comma-separated fields per line: location, latitude, longitude.
#' Lines whose location starts with "#" are skipped.
#'
#' @param file_name Path to the input file.
#' @return NULL.
export_all_suburbs <- function(file_name) {
  # Clear the log
  clear_log()

  # Read the input file
  suburbs_input <- read.table(
    file_name,
    header = FALSE, sep = ",",
    col.names = c("location", "latitude", "longitude"),
    strip.white = TRUE, comment.char = "", quote = ""
  )
  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]

  # Call export_a_suburb for each remaining row. seq_len() yields nothing
  # for an empty table, whereas 1:0 would visit the non-existent rows 1 and 0.
  for (i in seq_len(nrow(suburbs_input))) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])
    export_a_suburb(location, latitude, longitude)
  }
  NULL
}
```
# Export all
```{r}
# Run the sequential export for every suburb listed in main1_INPUT.txt
# (one "location, latitude, longitude" row per suburb; rows whose location
# starts with "#" are skipped).
export_all_suburbs("main1_INPUT.txt")
```