main1_export_Brandon.rmd

# Setup
```{r, echo=F, message=F}
library(tidygeocoder)
library(tidyverse)
library(broom)
library(dplyr)
library(rafalib)
library(plotly)
library(rvest)
library(lubridate)
library(geosphere)
library(parallel)
```

# Basic functions:
```{r}
# House scraping: get_df_suburb
#
# Scrape the sold-townhouse listings of one suburb from auhouseprices.com and
# return them as a single data frame.
#
# Arguments:
#   location  Path fragment inserted into the listing URL, e.g.
#             "2151/Parramatta/". Runs of whitespace are replaced by "+".
#
# Returns a data frame with the columns House_ID, address, bedroom, bathroom,
# carspace, soldprice and yearsold (only sales that list a price), or NULL
# when anything goes wrong. On failure the location is appended to
# "main1_export_Brandon_log/main1_export_Brandon_log.txt" and the reason is
# reported through message().
#
# Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
get_df_suburb <- function(location = "2151/Parramatta/"){
  tryCatch({
    location <- gsub("\\s+", "+", location)
    print(location)

    # Listing URL of one result page: type set to townhouse, no other filtering
    page_url <- function(page) {
      paste0("https://www.auhouseprices.com/sold/list/NSW/",
             location,
             page,
             "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw=")
    }

    url <- page_url(1)
    print(url)
    first_page <- read_html(url)

    # get the number of properties and the number of property displayed on each page
    find_page_number <- first_page %>% html_nodes("h2") %>% html_text()
    find_page_number <- find_page_number[1]
    numbers <- as.numeric(regmatches(find_page_number, gregexpr("[0-9]+", find_page_number))[[1]])
    # number of total properties / number on page = total number of pages
    end_page <- ceiling(numbers[3] / numbers[2])
    if (!is.finite(end_page) || end_page < 1) {
      # Fail explicitly instead of looping over 1:0 or 1:NA
      stop("could not determine the number of result pages")
    }

    print(paste0(location, ": begins 0/4"))
    print(paste0( "Current suburb: ", location) )
    print(paste0( "Total pages ", end_page) )

    # One data frame per page, bound together once after the loop
    pages <- vector("list", end_page)

    for (this_page in seq_len(end_page)) {
      if (this_page %% 5 == 0) {
        print(paste0("Page processed: ", this_page, "/", end_page))
      }

      # Page 1 was already downloaded to read the page count; reuse it
      webpage <- if (this_page == 1) first_page else read_html(page_url(this_page))

      result <- webpage %>% html_nodes("li") %>% html_text()
      # end of the relevant content
      result <- result[seq_len(grep("current", result)[1])]
      # remove the redundant "listed price"
      result <- result[!grepl("List", result)]
      # remove the price listed with rent
      result <- result[!grepl("Rent", result)]

      # filter information on price and number of bedroom/bathroom/carspace
      price_bedroom <- result[grep("\\$", result)]
      price_bedroom <- strsplit(price_bedroom, "\\$")
      bedroom <- lapply(price_bedroom, `[`, 1)
      bedroom <- strsplit(unlist(trimws(bedroom)), "\\s+")

      price <- lapply(price_bedroom, `[`, 2)
      price <- trimws(price)
      price <- as.numeric(gsub(",", "", price))

      # filter information on sold month and year
      # note sometimes the price is not listed, therefore only get the ones with the price
      timesold <- result[grep("\\$", result) - 1]
      timesold <- trimws(gsub("Sold on", "", timesold))

      # whether to use day month year or just month year (or a bare year)
      timesold <- lapply(timesold, function(x) {
        check_format <- strsplit(x, "\\s")
        if (length(check_format[[1]]) == 3) {
          x <- dmy(x)
        } else if (length(check_format[[1]]) == 2) {
          x <- my(x)
        } else {
          x <- as.Date(paste0(x, "-01-01"))
        }
        x
      })
      timesold <- do.call("c", timesold)

      # get address of these properties
      address <- webpage %>% html_nodes("h4") %>% html_text()
      # keep only the headings that come before the "Auction History" heading
      address <- address[seq_len(grep("Auction History", address)[1] - 1)]

      # decide which address contain sold price
      sold_info <- grep("Sold on", result)  # entry with sold info
      price_info <- grep("\\$", result)     # entry with price info
      # for every sold entry, the immediate next row should be price; if not,
      # then this sold entry does not have a price record
      contain_price <- sold_info %in% c(price_info - 1)
      # only record those property that has price recorded
      address <- address[contain_price]

      pages[[this_page]] <- data.frame(
        address = address,
        bedroom = as.numeric(unlist(lapply(bedroom, `[`, 1))),
        bathroom = as.numeric(unlist(lapply(bedroom, `[`, 2))),
        carspace = as.numeric(unlist(lapply(bedroom, `[`, 3))),
        soldprice = price,
        yearsold = timesold
      )
    }

    df <- do.call(rbind, pages)
    if (is.null(df) || nrow(df) == 0) {
      stop("no sales with a listed price were found")
    }

    # Running row number as the first column
    df <- cbind(House_ID = seq_len(nrow(df)), df)

    print(paste0("Page processed: ", this_page, "/", end_page))
    print(paste0(location, ": 1/4: get_df_suburb: creating data frame done!"))
    df
  }, error = function(e) {
    # Report why the suburb failed instead of swallowing the error silently
    message("get_df_suburb failed for ", location, ": ", conditionMessage(e))

    # Set the file path and name
    file_path <- "main1_export_Brandon_log"
    file_name <- "main1_export_Brandon_log.txt"

    # Create the directory if it doesn't exist
    if (!dir.exists(file_path)) {
      dir.create(file_path)
    }

    # Write location to the file so failed suburbs can be retried later
    write(location, file.path(file_path, file_name), append = TRUE)
    NULL
  })
}


# Haversine distance between a point (lat, lon) and a fixed reference point
# (fixed_lat, fixed_lon), as computed by geosphere::distHaversine().
#
# Arguments are given latitude-first, but distHaversine() is called with each
# point as c(longitude, latitude), so the coordinates are swapped here.
add_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  from_point <- c(lon, lat)
  to_point <- c(fixed_lon, fixed_lat)
  distHaversine(from_point, to_point)
}


# Geocode every address in df_suburb and append the distance from each
# property to a fixed reference point.
#
# Arguments:
#   df_suburb   Data frame with an `address` column.
#   suburb_lat  Latitude of the reference point.
#   suburb_lon  Longitude of the reference point.
#   location    Label used only in the progress messages.
#
# Returns df_suburb with the geocoder's `latitude` and `longitude` columns plus
# a `distance_to_train_station` column computed by add_distance_between().
get_l_suburb_dist <- function(df_suburb, suburb_lat, suburb_lon, location) {
  geocoded <- geocode(df_suburb, address, method = 'arcgis', lat = latitude, long = longitude)
  print(paste0(location, ": 2/4: get_l_suburb: done!"))

  # Row-wise distance: each row arrives as c(latitude, longitude)
  station_dist <- apply(
    geocoded[, c("latitude", "longitude")],
    1,
    function(coords) add_distance_between(coords[1], coords[2], suburb_lat, suburb_lon)
  )
  with_distance <- data.frame(geocoded, distance_to_train_station = station_dist)

  print(paste0(location, ": 3/4: get_l_suburb_dist: done!"))
  with_distance
}


# Write one suburb's data frame to "csv_cache/l_<location>houseprice.csv",
# relative to the current working directory.
#
# Arguments:
#   location       Suburb label; every "/" is replaced by "_" in the file name.
#   l_suburb_dist  Data frame to export (written without row names).
#
# Returns the string "Result: csv export finished".
export_l_suburb_dist_csv <- function(location, l_suburb_dist) {
  # Check and create the SAME directory the file is written to; the previous
  # check looked at "~/csv_cache" while the file went to "./csv_cache".
  cache_dir <- "csv_cache"
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir)  # create directory if it doesn't exist
  }
  file_name <- paste0("l_", gsub("/", "_", location), "houseprice.csv")
  print(file_name)
  file_path <- file.path(cache_dir, file_name)  # specify file path
  write.csv(l_suburb_dist, file_path, row.names = FALSE)  # export as CSV file
  print(paste0(location, ": 4/4: export_l_suburb_dist_csv: done!"))
  return("Result: csv export finished")
}


# Run the full pipeline for one suburb: scrape, geocode and measure distance,
# then export to CSV.
#
# Arguments:
#   location    Suburb path fragment passed to get_df_suburb().
#   suburb_lat  Latitude of the reference point for the distance column.
#   suburb_lon  Longitude of the reference point for the distance column.
#
# Returns NULL when get_df_suburb() returned NULL (nothing is geocoded or
# exported in that case); otherwise the printed completion message.
export_a_suburb <- function(location, suburb_lat, suburb_lon) {
  scraped <- get_df_suburb(location)

  if (!is.null(scraped)) {
    with_distance <- get_l_suburb_dist(scraped, suburb_lat, suburb_lon, location)
    export_l_suburb_dist_csv(location, with_distance)
    print(paste0(location, ": Finish csv export"))
  } else {
    # get_df_suburb() signals a failure with NULL; skip the remaining steps
    NULL
  }
}


# Export every suburb listed in an input file, processing suburbs in parallel
# with parallel::mclapply().
#
# Arguments:
#   file_name  Comma-separated file without header; columns are read as
#              location, latitude, longitude. Rows whose location starts with
#              "#" are skipped.
#   num_cores  Value passed to mclapply() as mc.cores.
#
# Returns NULL; the work is done through export_a_suburb()'s side effects.
export_all_suburbs_parallel <- function(file_name, num_cores = detectCores()) {
  # Read the input file
  suburbs_input <- read.table(file_name, header = FALSE, sep = ",", col.names = c("location", "latitude", "longitude"), strip.white = TRUE, comment.char = "", quote = "")

  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]

  # Function to process a single suburb (used by mclapply)
  process_suburb <- function(i) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])

    export_a_suburb(location, latitude, longitude)
  }

  # seq_len() yields an empty index when no rows remain, whereas 1:nrow()
  # would yield c(1, 0) and process bogus rows.
  mclapply(seq_len(nrow(suburbs_input)), process_suburb, mc.cores = num_cores)

  NULL
}


# Delete the failure log "main1_export_Brandon_log/main1_export_Brandon_log.txt"
# if it exists.
#
# Returns TRUE when the log file was removed, FALSE otherwise (including when
# there was no log to remove).
clear_log <- function() {
  # Set the file path and name
  file_path <- "main1_export_Brandon_log"
  file_name <- "main1_export_Brandon_log.txt"
  log_file <- file.path(file_path, file_name)

  # file.remove() warns on a missing file, so skip it when nothing was logged
  if (!file.exists(log_file)) {
    return(FALSE)
  }

  # Remove the file
  file.remove(log_file)
}


# Export every suburb listed in an input file, one after another.
#
# Arguments:
#   file_name  Comma-separated file without header; columns are read as
#              location, latitude, longitude. Rows whose location starts with
#              "#" are skipped.
#
# The failure log is cleared first, so after the run it only holds suburbs
# that failed during this run. Returns NULL.
export_all_suburbs <- function(file_name) {
  # Clear the log
  clear_log()

  # Read the input file
  suburbs_input <- read.table(file_name, header = FALSE, sep = ",", col.names = c("location", "latitude", "longitude"), strip.white = TRUE, comment.char = "", quote = "")

  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]

  # Loop through each row in the input file and call export_a_suburb function.
  # seq_len() skips the loop entirely when no rows remain, whereas 1:nrow()
  # would iterate over c(1, 0).
  for (i in seq_len(nrow(suburbs_input))) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])

    export_a_suburb(location, latitude, longitude)
  }

  NULL
}
```


# Export all
```{r}
# Run the sequential export for every suburb listed in "main1_INPUT.txt"
export_all_suburbs("main1_INPUT.txt")
```