Skip to content

Latest commit

 

History

History
828 lines (651 loc) · 21.6 KB

ch21.md

File metadata and controls

828 lines (651 loc) · 21.6 KB

Chapter 21 - Exercises - R for Data Science

Francisco Yira Albornoz November 19th 2018

21.2 For loops

21.2.1 Exercises

  1. Write for loops to:

  2. Compute the median of every column in mtcars

# Median of every mtcars column, collected in a preallocated double vector
# (one slot per column, filled by position).
median_mtcars <- numeric(length(mtcars))
for (col_idx in seq_along(mtcars)) {
  median_mtcars[[col_idx]] <- median(mtcars[[col_idx]])
}
# Display the result.
median_mtcars
##  [1]  19.200   6.000 196.300 123.000   3.695   3.325  17.710   0.000   0.000
## [10]   4.000   2.000
  1. Determine the type of each column in nycflights13::flights
# Storage type (as reported by typeof()) of every column in the flights data,
# collected in a preallocated character vector.
typecol_flights <- character(length(nycflights13::flights))
for (col_idx in seq_along(nycflights13::flights)) {
  typecol_flights[[col_idx]] <- typeof(nycflights13::flights[[col_idx]])
}
# Display the result.
typecol_flights
##  [1] "integer"   "integer"   "integer"   "integer"   "integer"   "double"   
##  [7] "integer"   "integer"   "double"    "character" "integer"   "character"
## [13] "character" "character" "double"    "double"    "double"    "double"   
## [19] "double"
  1. Compute the number of unique values in each column of iris
# Number of distinct values in every iris column, collected in a preallocated
# integer vector.
n_unique_val_iris <- integer(length(iris))
for (col_idx in seq_along(iris)) {
  n_unique_val_iris[[col_idx]] <- length(unique(iris[[col_idx]]))
}
# Display the result.
n_unique_val_iris
## [1] 35 23 43 22  3
  1. Generate 10 random normals for each of μ = −10, 0, 10, and 100
# One batch of 10 normal draws per requested mean, kept in a list so that each
# batch stays a separate vector.
means <- c(-10, 0, 10, 100)
random_normals <- vector(mode = "list", length = length(means))
for (mean_idx in seq_along(means)) {
  random_normals[[mean_idx]] <- rnorm(n = 10, mean = means[[mean_idx]])
}
# Display the result.
random_normals
## [[1]]
##  [1]  -9.976628 -10.280885  -9.877374  -9.566374  -8.989570  -9.878482
##  [7] -10.016595  -9.110575 -11.950144  -8.749682
## 
## [[2]]
##  [1]  0.4931259  1.6412130 -1.0545904 -0.2589900 -0.3901266  0.1553609
##  [7]  0.9281104  0.9369941 -0.5295036  1.0132690
## 
## [[3]]
##  [1]  9.031743 10.041307  9.406935 10.410370  9.326198  9.751394 10.509925
##  [8]  9.818359  8.827326 10.280023
## 
## [[4]]
##  [1] 100.34042 101.72786 100.85984  99.54684 100.41539 100.10072 100.73846
##  [8] 100.81508 100.63034 101.06028
  1. Eliminate the for loop in each of the following examples by taking advantage of an existing function that works with vectors
# Exercise snippet (kept as given): builds the alphabet one letter at a time.
out <- ""
for (x in letters) {
  # Append the current letter to the string accumulated so far.
  out <- stringr::str_c(out, x)
}
out
## [1] "abcdefghijklmnopqrstuvwxyz"

Alternative without for loop:

stringr::str_c(letters, collapse = "")
## [1] "abcdefghijklmnopqrstuvwxyz"
# Exercise snippet (kept as given): sample standard deviation via an explicit
# loop.
x <- sample(100)
# NOTE: this accumulator is named like the sd() function.
sd <- 0
for (i in seq_along(x)) {
  # Add the squared deviation of element i from the mean; mean(x) is
  # re-evaluated on every iteration.
  sd <- sd + (x[i] - mean(x)) ^ 2
}
# Divide the sum of squares by n - 1 and take the square root.
sd <- sqrt(sd / (length(x) - 1))

Alternative without for loop:

sd(x)
## [1] 29.01149
# Exercise snippet (kept as given): running total via an explicit loop.
x <- runif(100)
out <- vector("numeric", length(x))
# Seed the running total with the first element ...
out[1] <- x[1]
# ... then each slot is the previous total plus the current element.
for (i in 2:length(x)) {
  out[i] <- out[i - 1] + x[i]
}
out
# Alternative without for loop:
cumsum(x)
  1. Combine your function writing and for loop skills:

  2. Write a for loop that prints() the lyrics to the children’s song “Alice the camel”.

# Lyrics data: element i of each vector describes verse i of the song.
names_song <- c("Alice", "Ruby", "Sally", "Felix", "Lilly", "Andy", "Larry", "Sammy")
animals_song <- c("Camel", "Rabbit", "Sloth", "Fox", "Ladybug", "Ant", "Lizard", "Spider")
# The counts run from one to eight, one per verse (the sloth has three toes).
numbers_song <- c("one", "two", "three", "four", "five", "six", "seven", "eight")
what_they_have <- c("hump", "ears", "toes", "legs", "spots", "legs", "stripes", "legs")

for (i in seq_along(names_song)) {
  # The verse line is the same for all four repetitions, so build it once.
  verse_line <- stringr::str_c(
    names_song[[i]],
    " the ",
    animals_song[[i]],
    " has ",
    numbers_song[[i]],
    " ",
    what_they_have[[i]],
    "."
  )
  # writeLines() prints each element of its argument on its own line.
  writeLines(rep(verse_line, times = 4))
  # The trailing "\n" leaves a blank line between verses.
  writeLines(stringr::str_c("Go ", names_song[[i]], " go!\n"))
}
## Alice the Camel has one hump.
## Alice the Camel has one hump.
## Alice the Camel has one hump.
## Alice the Camel has one hump.
## Go Alice go!
## 
## Ruby the Rabbit has two ears.
## Ruby the Rabbit has two ears.
## Ruby the Rabbit has two ears.
## Ruby the Rabbit has two ears.
## Go Ruby go!
## 
## Sally the Sloth has three toes.
## Sally the Sloth has three toes.
## Sally the Sloth has three toes.
## Sally the Sloth has three toes.
## Go Sally go!
## 
## Felix the Fox has four legs.
## Felix the Fox has four legs.
## Felix the Fox has four legs.
## Felix the Fox has four legs.
## Go Felix go!
## 
## Lilly the Ladybug has five spots.
## Lilly the Ladybug has five spots.
## Lilly the Ladybug has five spots.
## Lilly the Ladybug has five spots.
## Go Lilly go!
## 
## Andy the Ant has six legs.
## Andy the Ant has six legs.
## Andy the Ant has six legs.
## Andy the Ant has six legs.
## Go Andy go!
## 
## Larry the Lizard has seven stripes.
## Larry the Lizard has seven stripes.
## Larry the Lizard has seven stripes.
## Larry the Lizard has seven stripes.
## Go Larry go!
## 
## Sammy the Spider has eight legs.
## Sammy the Spider has eight legs.
## Sammy the Spider has eight legs.
## Sammy the Spider has eight legs.
## Go Sammy go!
  1. Convert the nursery rhyme “ten in the bed” to a function. Generalise it to any number of people in any sleeping structure.
library(stringr)
library(english)
## Warning: package 'english' was built under R version 4.1.3
# Print the lyrics of the nursery rhyme "ten in the bed", generalised to any
# number of people in any sleeping structure.
#
# Arguments:
#   n_people            Number of people at the start of the song. Coerced with
#                       as.integer(); must be a single value of at least 1.
#   sleeping_structure  Name of the thing everyone is sleeping in.
#
# Returns NULL invisibly; the function is called for its printed output.
# Stops with an error when n_people is not a single valid integer or is
# smaller than 1.
people_fell_song <-
  function(n_people = 10,
           sleeping_structure = "bed") {
    # Input validation. Coerce *before* comparing: otherwise NA (or anything
    # that coerces to NA) reaches `if (n_people < 1)` and fails there with an
    # unrelated "missing value where TRUE/FALSE needed" error.
    n_people <- suppressWarnings(as.integer(n_people))
    if (length(n_people) != 1L || is.na(n_people)) {
      stop("n_people should be a valid integer")
    }
    if (n_people < 1L) {
      stop("n_people should be greater than zero")
    }

    # Count down from n_people to 1 (n_people >= 1 is guaranteed above).
    for (people_left in rev(seq_len(n_people))) {
      # Turning number of people into a word
      people_in <- as.english(people_left)

      # Handling special case for first strophe
      if (people_left == n_people) {
        writeLines("Here we go!")
      } else {
        writeLines(str_c(str_to_title(people_in), "!"))
      }

      # Lines that appear in every strophe; the verb agrees with the count
      # ("There was one", "There were two").
      verb <- if (people_left == 1L) "was" else "were"
      writeLines(
        str_c("There ", verb, " ", people_in, " in the ", sleeping_structure)
      )
      writeLines("and the little one said,")

      # Handling special case for last strophe. The comparison uses the plain
      # integer count rather than the `english` object.
      if (people_left > 1L) {
        writeLines('"Roll over, roll over."')
        writeLines("So they all rolled over and one fell out.\n")
      } else {
        writeLines('"Good night!"')
      }
    }

    invisible(NULL)
  }

people_fell_song(5, "sofa")
## Here we go!
## There were five in the sofa
## and the little one said,
## "Roll over, roll over."
## So they all rolled over and one fell out.
## 
## Four!
## There were four in the sofa
## and the little one said,
## "Roll over, roll over."
## So they all rolled over and one fell out.
## 
## Three!
## There were three in the sofa
## and the little one said,
## "Roll over, roll over."
## So they all rolled over and one fell out.
## 
## Two!
## There were two in the sofa
## and the little one said,
## "Roll over, roll over."
## So they all rolled over and one fell out.
## 
## One!
## There was one in the sofa
## and the little one said,
## "Good night!"
  1. Convert the song “99 bottles of beer on the wall” to a function. Generalise to any number of any vessel containing any liquid on any surface.
library(stringr)

# Print the lyrics of "99 bottles of beer on the wall", generalised to any
# number of any vessel containing any liquid on any surface.
#
# Arguments:
#   n_vessels        Number of vessels at the start of the song. Coerced with
#                    as.integer(); must be a single value of at least 1.
#   vessel           Plural name of the vessel (e.g. "bottles").
#   liquid           What the vessels contain.
#   surface          What the vessels are standing on.
#   vessel_singular  Optional singular name of the vessel (e.g. "bottle"),
#                    used when exactly one vessel is mentioned. When NULL
#                    (the default) `vessel` is used for every count.
#
# Returns NULL invisibly; the function is called for its printed output.
# Stops with an error when n_vessels is not a single valid integer or is
# smaller than 1.
song_bottles_beer <- function(n_vessels = 99, vessel = "bottles",
                              liquid = "beer", surface = "wall",
                              vessel_singular = NULL) {
    # Input validation. Coerce *before* comparing: otherwise NA (or anything
    # that coerces to NA) reaches `if (n_vessels < 1)` and fails there with an
    # unrelated "missing value where TRUE/FALSE needed" error.
    n_vessels <- suppressWarnings(as.integer(n_vessels))
    if (length(n_vessels) != 1L || is.na(n_vessels)) {
      stop("n_vessels should be a valid integer")
    }
    if (n_vessels < 1L) {
      stop("n_vessels should be greater than zero")
    }

    if (is.null(vessel_singular)) {
      vessel_singular <- vessel
    }

    # "<n> <vessel(s)> of <liquid>", picking the singular form for n == 1.
    count_phrase <- function(n) {
      str_c(n, " ", if (n == 1L) vessel_singular else vessel, " of ", liquid)
    }
    none_phrase <- str_c("no more ", vessel, " of ", liquid)

    # Count down from n_vessels to 1 (n_vessels >= 1 is guaranteed above).
    for (vessels_left in rev(seq_len(n_vessels))) {
      now <- count_phrase(vessels_left)
      # What remains once one vessel has been taken down.
      remaining <- if (vessels_left > 1L) {
        count_phrase(vessels_left - 1L)
      } else {
        none_phrase
      }

      writeLines(str_c(now, " on the ", surface, ", ", now, "."))
      writeLines(
        str_c(
          "Take one down and pass it around, ", remaining,
          " on the ", surface, ".\n"
        )
      )
    }

    # Closing strophe, printed once after the countdown.
    writeLines(
      str_c(
        "No more ", vessel, " of ", liquid, " on the ", surface, ", ",
        none_phrase, "."
      )
    )
    writeLines(
      str_c(
        "Go to the store and buy some more, ", count_phrase(n_vessels),
        " on the ", surface, "."
      )
    )

    invisible(NULL)
  }

song_bottles_beer(5, vessel = "glasses", liquid = "water", surface = "table")
## 5 glasses of water on the table, 5 glasses of water.
## Take one down and pass it around, 4 glasses of water on the table.
## 
## 4 glasses of water on the table, 4 glasses of water.
## Take one down and pass it around, 3 glasses of water on the table.
## 
## 3 glasses of water on the table, 3 glasses of water.
## Take one down and pass it around, 2 glasses of water on the table.
## 
## 2 glasses of water on the table, 2 glasses of water.
## Take one down and pass it around, 1 glasses of water on the table.
## 
## 1 glasses of water on the table, 1 glasses of water.
## Take one down and pass it around, no more glasses of water on the table.
## 
## No more glasses of water on the table, no more glasses of water.
## Go to the store and buy some more, 5 glasses of water on the table.
  1. It’s common to see for loops that don’t preallocate the output and instead increase the length of a vector at each step:
output <- vector("integer", 0)
for (i in seq_along(x)) {
  output <- c(output, lengths(x[[i]]))
}
output

How does this affect performance? Design and execute an experiment.

library(microbenchmark)

# Benchmark subject: collects lengths(x[[i]]) for every element of x by
# appending to `output` with c() on each iteration, starting from an empty
# integer vector (no preallocation).
func_wo_preallocate <- function(x) {
  output <- vector("integer", 0)
  for (i in seq_along(x)) {
    # `output` is replaced by a new, longer vector on every pass.
    output <- c(output, lengths(x[[i]]))
  }
  output
}

# Benchmark subject: collects lengths(x[[i]]) for every element of x, writing
# each value into a slot of an integer vector allocated up front with the
# final length.
func_with_preallocate <- function(x) {
  output <- vector("integer", length(x))
  for (i in seq_along(x)) {
    # Assign into the existing slot instead of growing the vector.
    output[[i]] <- lengths(x[[i]])
  }
 output  
}

# Input for the experiment: an integer vector with 1000 elements.
test_vec <- 1:1000

# Time both implementations on the same input, 1000 evaluations each.
microbenchmark(func_wo_preallocate(test_vec),
               func_with_preallocate(test_vec),
               times = 1000L)
## # A tibble: 2,000 x 2
##    expr                               time
##    <fct>                             <dbl>
##  1 func_with_preallocate(test_vec) 3376600
##  2 func_with_preallocate(test_vec)  402900
##  3 func_wo_preallocate(test_vec)   3551500
##  4 func_with_preallocate(test_vec)  455200
##  5 func_with_preallocate(test_vec)  491000
##  6 func_with_preallocate(test_vec)  431700
##  7 func_wo_preallocate(test_vec)   1040000
##  8 func_with_preallocate(test_vec)  399100
##  9 func_wo_preallocate(test_vec)   1258400
## 10 func_wo_preallocate(test_vec)   1674400
## # ... with 1,990 more rows

21.3 For loop variations

21.3.5 Exercises

  1. Imagine you have a directory full of CSV files that you want to read in. You have their paths in a vector, files <- dir("data/", pattern = "\\.csv$", full.names = TRUE), and now want to read each one with read_csv(). Write the for loop that will load them into a single data frame.
# Read every CSV file into its own slot of a preallocated list ...
csv_tables <- vector(mode = "list", length = length(files))
for (file_idx in seq_along(files)) {
  csv_tables[[file_idx]] <- read_csv(files[[file_idx]])
}

# ... then stack the individual tables into a single data frame.
my_big_df <- dplyr::bind_rows(csv_tables)
  1. What happens if you use for (nm in names(x)) and x has no names? What if only some of the elements are named? What if the names are not unique?
# Demo vector: five elements but only four names are supplied, and "a" is
# used twice, so it covers both the "duplicated names" and the "missing name"
# cases.
my_vec <- 1:5
names(my_vec) <- c("a", "a", "b", "c")

for (nm in names(my_vec)) {
  # Look the element up by name; the printed results and the final error are
  # shown below.
  print(my_vec[[nm]])
}
## [1] 1
## [1] 1
## [1] 3
## [1] 4

## Error in my_vec[[nm]]: subscript out of bounds

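If x has no names, names(x) is NULL, so the loop body is never executed. If the names are not unique, subscripting by name returns the first element with that name each time the duplicated name appears. If only some of the elements are named, there will be an error as soon as the loop reaches an element without a name.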

  1. Write a function that prints the mean of each numeric column in a data frame, along with its name. For example, show_mean(iris) would print:
show_mean(iris)
#> Sepal.Length: 5.84
#> Sepal.Width:  3.06
#> Petal.Length: 3.76
#> Petal.Width:  1.20

(Extra challenge: what function did I use to make sure that the numbers lined up nicely, even though the variable names had different lengths?)

# Print the mean of each numeric column of a data frame, one "name: mean"
# line per column, with the means rounded to two decimals and lined up.
#
# Arguments:
#   x  A data frame. Non-numeric columns are skipped.
#
# Returns `x` invisibly; the function is called for its printed output.
show_mean <- function(x) {
  # Work on the argument (not a hard-coded data set) and keep only the
  # columns for which a mean is defined.
  numeric_cols <- x[vapply(x, is.numeric, logical(1))]
  if (length(numeric_cols) == 0L) {
    return(invisible(x))
  }

  # format() pads every label to the width of the longest one, which is what
  # lines the numbers up whatever the column names are
  # (stringr::str_pad(side = "right") is the stringr equivalent).
  labels <- format(paste0(names(numeric_cols), ":"))
  means <- vapply(numeric_cols, mean, numeric(1))

  # "%.2f" keeps trailing zeros, e.g. 3.60 rather than 3.6.
  writeLines(paste(labels, sprintf("%.2f", means)))
  invisible(x)
}

show_mean(mtcars)
## mpg:  20.09
## cyl:  6.19
## disp: 230.72
## hp:   146.69
## drat: 3.60
## wt:   3.22
## qsec: 17.85
## vs:   0.44
## am:   0.41
## gear: 3.69
## carb: 2.81
  1. What does this code do? How does it work?
# Exercise snippet (kept as given). `trans` is a named list of functions; each
# name is the mtcars column that the function is applied to.
trans <- list( 
  # Rescales the values by a constant factor (presumably a unit conversion
  # for displacement -- confirm).
  disp = function(x) x * 0.0163871,
  # Turns the values into a factor labelled "auto" / "manual".
  am = function(x) {
    factor(x, labels = c("auto", "manual"))
  }
)
# For every name in `trans`, overwrite the mtcars column of that name with the
# result of applying the matching function to it.
for (var in names(trans)) {
  mtcars[[var]] <- trans[[var]](mtcars[[var]])
}

trans is a list where each element is a function designed to apply a particular transformation to a specific column of mtcars. Each function is named after the column it is supposed to transform.

The for loop uses the names of the functions to select the corresponding columns in mtcars, and then replaces each column with its modified version, which is obtained by applying the corresponding function from the list.

21.4 For loops vs. functionals

21.4.1 Exercises

  1. Read the documentation for apply(). In the 2d case, what two for loops does it generalise?

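In the 2d case (a matrix or a data frame), apply() generalises looping over the rows or over the columns: apply(x, 1, f) replaces a for loop over the rows of x, and apply(x, 2, f) replaces a for loop over its columns.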

  1. Adapt col_summary() so that it only applies to numeric columns. You might want to start with an is_numeric() function that returns a logical vector that has a TRUE corresponding to each numeric column.
# Flag the numeric columns of a data frame.
#
# Arguments:
#   df  A data frame (or list of columns).
#
# Returns an unnamed logical vector with one element per column: TRUE for
# numeric (integer or double) columns, FALSE otherwise.
is_col_numeric <- function(df) {
  out <- vector("logical", length(df))
  for (i in seq_along(df)) {
    # is.numeric() always yields a single TRUE/FALSE. Comparing class() with
    # "numeric" misses integer columns and yields several values for columns
    # with more than one class (e.g. date-times), which breaks the assignment.
    out[[i]] <- is.numeric(df[[i]])
  }
  out
}


# Apply a summary function to every numeric column of a data frame.
#
# Arguments:
#   df   A data frame.
#   fun  Function called once per numeric column.
#
# Returns an unnamed vector with one summary value per column selected by
# is_col_numeric().
col_summary <- function(df, fun) {
  numeric_df <- df[is_col_numeric(df)]
  n_cols <- length(numeric_df)

  # Preallocate one slot per selected column, then fill by position.
  summaries <- double(n_cols)
  for (col_idx in seq_len(n_cols)) {
    summaries[col_idx] <- fun(numeric_df[[col_idx]])
  }
  summaries
}

col_summary(iris, mean)
## [1] 5.843333 3.057333 3.758000 1.199333

21.5 The map functions

21.5.3 Exercises

  1. Write code that uses one of the map functions to:

  2. Compute the mean of every column in mtcars.

map_dbl(mtcars, mean)
##        mpg        cyl       disp         hp       drat         wt       qsec 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250  17.848750 
##         vs         am       gear       carb 
##   0.437500   0.406250   3.687500   2.812500
  1. Determine the type of each column in nycflights13::flights
# The exercise asks about nycflights13::flights (not mtcars); map_chr() keeps
# the column names on the result.
map_chr(nycflights13::flights, typeof)
##           year          month            day       dep_time sched_dep_time 
##      "integer"      "integer"      "integer"      "integer"      "integer" 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##       "double"      "integer"      "integer"       "double"    "character" 
##         flight        tailnum         origin           dest       air_time 
##      "integer"    "character"    "character"    "character"       "double" 
##       distance           hour         minute      time_hour 
##       "double"       "double"       "double"       "double" 
  1. Compute the number of unique values in each column of iris
map_dbl(iris, ~length(unique(.)))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           35           23           43           22            3
  1. Generate 10 random normals for each of μ = −10, 0, 10, and 100
means <- c(-10, 0, 10, 100)
map(means, ~rnorm(10, mean = .))
## [[1]]
##  [1] -10.858736  -9.793184 -10.223092  -9.057800 -10.205051  -7.976423
##  [7] -10.664724  -9.251421  -7.369755  -9.666718
## 
## [[2]]
##  [1] -0.30214014  1.26317698 -0.31486994 -1.50665354 -0.06776626  0.45249957
##  [7]  0.77038067 -0.90672827  0.48784884  0.82800872
## 
## [[3]]
##  [1] 11.000797  9.875195  9.202104  7.984341  9.764668 10.294582  9.869245
##  [8]  9.224266  9.032104  9.513981
## 
## [[4]]
##  [1]  99.00828 100.47705 100.54046  98.20056  99.78811  99.44104  97.80572
##  [8] 100.26525 100.45618  99.86985
  1. How can you create a single vector that for each column in a data frame indicates whether or not it’s a factor?
map_lgl(mtcars, is.factor)
##   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
  1. What happens when you use the map functions on vectors that aren’t lists? What does map(1:5, runif) do? Why?
map(1:5, runif)
## [[1]]
## [1] 0.1748115
## 
## [[2]]
## [1] 0.705458761 0.002346674
## 
## [[3]]
## [1] 0.4691681 0.8412935 0.2413714
## 
## [[4]]
## [1] 0.71314932 0.06280471 0.55132120 0.79630701
## 
## [[5]]
## [1] 0.8952573 0.3166994 0.4248266 0.1931561 0.6947861

It iterates over each element of the vector, regardless of whether it is an atomic vector or a list. In this case, it passes each element of 1:5 to the function runif as the first argument (n).

  1. What does map(-2:2, rnorm, n = 5) do? Why? What does map_dbl(-2:2, rnorm, n = 5) do? Why?
map(-2:2, rnorm, n = 5)
## [[1]]
## [1] -1.140069 -3.528580 -2.989716 -1.835792 -2.735785
## 
## [[2]]
## [1] -1.14310491 -0.80160919 -0.17416975 -0.07329977 -1.25868572
## 
## [[3]]
## [1]  1.2460069  0.8118463 -1.5559002 -1.3585788 -0.7287338
## 
## [[4]]
## [1] 1.3607947 0.5690289 0.8809755 1.0549564 2.3954550
## 
## [[5]]
## [1] 1.145589 2.537922 1.339379 2.186574 2.697248

It passes each element of -2:2 to rnorm() as the second argument (mean), since the first argument (n) is specified through .... Therefore, it creates a list of five vectors of length five, generated from normal distributions with means going from -2 to 2.

map_dbl(-2:2, rnorm, n = 5)
## Error in `stop_bad_type()`:
## ! Result 1 must be a single double, not a double vector of length 5

This code tries to do the same, but it fails since map_dbl() tries to coerce the output to an atomic numeric vector, but that can not be done directly because each element of the result has length more than 1.

  1. Rewrite map(x, function(df) lm(mpg ~ wt, data = df)) to eliminate the anonymous function.
map(x, ~lm(mpg ~ wt, data = .))

21.9 Other patterns of for loops

21.9.3 Exercises

  1. Implement your own version of every() using a for loop. Compare it with purrr::every(). What does purrr’s version do that your version doesn’t?
x <- list(1:5, letters, list(10))

# Does the predicate hold for every element of `x`?
#
# Arguments:
#   x    A list or vector.
#   fun  Predicate function, called once on each element of x.
#
# Returns the result of all() over the per-element results (TRUE for an
# empty `x`). The predicate is evaluated on every element; there is no early
# exit.
my_every <- function(x, fun) {
  verdicts <- logical(length(x))
  for (pos in seq_along(x)) {
    verdicts[[pos]] <- fun(x[[pos]])
  }
  all(verdicts)
}

x %>% my_every(is_vector)
## [1] TRUE

The purrr version allows additional arguments to be passed to the predicate function through `...`.

  1. Create an enhanced col_summary() that applies a summary function to every numeric column in a data frame
# Flag the numeric columns of a data frame.
#
# Returns a logical vector (named after the columns) with TRUE for numeric
# (integer or double) columns. is.numeric() is used instead of comparing
# class() with "numeric": the comparison misses integer columns and returns
# more than one value for columns with several classes, which map_lgl()
# rejects.
is_col_numeric <- function(df) {
  map_lgl(df, is.numeric)
}


# Apply a summary function to every numeric column of a data frame.
#
# Arguments:
#   df   A data frame.
#   fun  Summary function, called once per numeric column.
#
# Returns the result of map_dbl() over the columns selected by
# is_col_numeric().
col_summary <- function(df, fun) {
  keep <- is_col_numeric(df)
  map_dbl(df[keep], fun)
}

col_summary(iris, mean)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333
  1. A possible base R equivalent of col_summary() is:
# Exercise code, kept exactly as given: the exercise is to find its bugs, which
# are discussed in the text that follows.
col_sum3 <- function(df, f) {
  # sapply() chooses its return type from the input, so `is_num` is not
  # guaranteed to be a logical vector.
  is_num <- sapply(df, is.numeric)
  df_num <- df[, is_num]

  # The same caveat applies to the type of the final result.
  sapply(df_num, f)
}

But it has a number of bugs as illustrated with the following inputs:

df <- tibble(
  x = 1:3, 
  y = 3:1,
  z = c("a", "b", "c")
)
# OK
col_sum3(df, mean)
## x y 
## 2 2
# Has problems: don't always return numeric vector
col_sum3(df[1:2], mean)
## x y 
## 2 2
col_sum3(df[1], mean)
## x 
## 2
col_sum3(df[0], mean)
## Error in `vectbl_as_col_location()`:
## ! Must subset columns with a valid subscript vector.
## x Subscript `is_num` has the wrong type `list`.
## i It must be logical, numeric, or character.

In the case where df is subsetted by the empty vector numeric(0), sapply(df, is.numeric) returns a list instead of a logical vector, causing the subsequent subsetting operation (df[, is_num]) to fail, since a list cannot be used for subsetting — only a logical, numeric, or character vector can.

Also, even if the subsetting operation succeeded in this case, the output wouldn’t be a numeric vector, because sapply(df_num, f) returns a list when the result has length zero.

We can avoid both errors by replacing sapply with map_lgl and map_dbl in each call, respectively.