# main.r -- online logistic regression on hashed (one-hot) features,
# trained with stochastic gradient descent and an adaptive learning rate.
# NOTE(review): setwd() in a script is fragile -- it assumes the script is
# launched from a specific subdirectory so that '..' contains the data files
# (hash_train.csv / hash_test.csv). Prefer absolute paths or running from the
# data directory; confirm the intended working directory with the author.
setwd('..')
# function definitions ##########################
# Bounded logarithmic loss.
#
# Clamps predictions away from 0 and 1 so log() stays finite, then returns
# the mean negative log-likelihood of the labels under the predictions.
#
# actual:     observed class labels (0 or 1)
# prediction: predicted probabilities p(y = 1)
# returns:    scalar logloss of `prediction` given `actual`
llfun <- function(actual, prediction) {
  epsilon <- 10e-14
  # Bound predictions to [epsilon, 1 - epsilon] before taking logs.
  clipped <- pmax(pmin(prediction, 1. - epsilon), epsilon)
  -mean(actual * log(clipped) + (1. - actual) * log(1. - clipped))
}
# Prediction based on features and weights.
#
# NOTE: this masks the stats::predict generic for the rest of the script.
#
# input:
#   x: active (hashed) feature indices; each active feature has value 1
#   w: weight vector indexed by hashed feature
# output:
#   probability p(y = 1 | x ; w)
predict <- function(x, w) {
  # Inner product w'x: since every active feature has value 1, this is just
  # the sum of the weights at the active indices (vectorized, replacing the
  # original scalar accumulation loop; identical result, incl. empty x -> 0).
  idx <- unlist(x, use.names = FALSE)
  wTx <- sum(w[idx])
  # Clamp the logit to [-20, 20] to avoid overflow/underflow in exp().
  1. / (1. + exp(-max(min(wTx, 20.), -20.)))
}
# One stochastic-gradient-descent step for a single training example.
#
# input:
#   w:          weight vector (indexed by hashed feature)
#   n:          per-feature update counts, driving the adaptive rate
#   x:          active (hashed) feature indices for this example
#   prediction: model output p(y = 1 | x ; w)
#   actual:     true label (0 or 1)
#   alpha:      base learning rate. Previously this was read from a global
#               inside the loop; the default below reproduces that lookup
#               (falling back to 1 when no global `alpha` exists) so existing
#               five-argument calls behave identically.
# output:
#   list with the updated 'w' and 'n'
update <- function(w, n, x, prediction, actual,
                   alpha = get0("alpha", ifnotfound = 1.)) {
  for (i in x) {
    # alpha / (sqrt(n[i]) + 1) is the adaptive learning rate heuristic;
    # (prediction - actual) is the gradient, since x[i] = 1 for active i.
    adp_rate <- alpha / (sqrt(n[i]) + 1.)
    w[i] <- w[i] - (prediction - actual) * adp_rate
    n[i] <- n[i] + 1.
  }
  list('w' = w, 'n' = n)
}
# training #######################################################
library(ff)
# Read the hashed features generated by one-hot encoding (done off-line).
df1 <- read.csv.ffdf(file = 'hash_train.csv', VERBOSE = TRUE)

# initialize our model
D <- 2**20       # dimensionality of the hashed feature space
alpha <- 1       # initial learning rate for stochastic gradient descent
w <- rep(0, D)   # initial weights
n <- rep(0, D)   # per-feature update counts (for the adaptive rate)

# Train the SGD logistic-regression model in a single pass over the data.
loss <- 0.
for (i in seq_len(nrow(df1))) {
  row <- df1[i, ]
  # First column is the target; map it to a numeric 0/1 label.
  y <- if (row[1] == 1) 1. else 0.
  # Get the hashed features, excluding the target column.
  x <- row[-1]
  # Prediction based on the current features and weights.
  prediction <- predict(x, w)
  # Accumulate logloss; report the running average every 10k rows.
  loss <- loss + llfun(y, prediction)
  if (i %% 10000 == 0) print(loss / i)
  # Update the weights and the per-feature counters.
  wn_update <- update(w, n, x, prediction, y)
  w <- wn_update$w
  n <- wn_update$n
}
# testing #######################################################
# Read the hashed features for the test set (one-hot encoded off-line).
df2 <- read.csv.ffdf(file = 'hash_test.csv', VERBOSE = TRUE)
sub_file <- "submission.csv"
# Write the submission header row.
FF <- as.matrix(t(c("id", "click")))
write.table(FF, file = sub_file, row.names = FALSE, col.names = FALSE,
            sep = ",")
for (i in seq_len(nrow(df2))) {
  row <- df2[i, ]
  id <- row[1]
  # Get the hashed features, excluding the id column.
  x <- row[-1]
  # Prediction based on the features and the trained weights.
  prediction <- predict(x, w)
  # Append this test case's prediction to the submission file.
  FF <- as.matrix(t(list(id, prediction)))
  write.table(FF, file = sub_file, row.names = FALSE, col.names = FALSE,
              sep = ",", append = TRUE)
}