forked from rstudio/fsbench
-
Notifications
You must be signed in to change notification settings - Fork 0
/
_functions.R
117 lines (98 loc) · 3.72 KB
/
_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
target_dir <- Sys.getenv("TARGET_DIR", "../fsbench.work")
target <- function(...) {
file.path(target_dir, ...)
}
times <- list()
# construct a static 100MB dataframe
static_df_rows <- 0.0291268 * 100*1024*1024
static_df <- data.frame(x1 = runif(static_df_rows), x2 = runif(static_df_rows))
benchmark_begin <- function() {
times <<- list()
}
benchmark <- function(name, task, parallelism = NA) {
if (!is.na(parallelism)) {
name <- sprintf("%s(%d)", name, parallelism)
}
message("
# ================================================
# Task: ", name, "
# ================================================
")
dump_cache()
force(task)
if (!inherits(task, "proc_time")) {
warning("Task '", name, "' returned wrong object type. Expected 'proc_time', got '", class(task)[1], "'")
} else {
times[[name]] <<- list(time=task, parallelism=parallelism)
}
}
aggregate_benchmark <- function(name, iterations, task) {
dump_cache()
# perform the timer on the entire set of iterations instead of timing each
# iteration individually - we need to do this because the overhead of using
# system.time is HUGE! calling this in a loop is disastrous for performance,
# and thus causes very inaccurate benchmarks. as a result, each iteration
# should strive to minimize calls unrelated to the actual filesystem being tested
aggregate_time = system.time({
for (i in 1:iterations) {
task(i)
}
})
times[[name]] <<- list(time=aggregate_time, parallelism=NA)
}
benchmark_end <- function() {
df <- do.call(rbind, unname(lapply(times, function(list) {
proc_time <- list[[1]]
parallelism <- as.integer(list[[2]])
pt <- summary(proc_time)
data.frame(user = pt[[1]], system = pt[[2]], elapsed = pt[[3]], parallelism = parallelism)
})))
# strip off the added parallelism in the name when creating the final data frame
df <- cbind(data.frame(task = sub("(.*)\\(([\\d])\\)$", "\\1", names(times), perl=TRUE), stringsAsFactors = FALSE), df)
df <- tibble::as_tibble(df)
results_filename <- Sys.getenv("OUTPUT_FILE", "")
if (!nzchar(results_filename)) {
results_filename <- paste0("results-", format(Sys.time(), format = "%Y%m%d-%H%M%S"), ".csv")
message("No OUTPUT_FILE env var; writing to ", results_filename)
}
write.csv(df, results_filename, row.names = FALSE)
print(df)
}
time_install <- function(pkgname, type = "source", ...) {
pkgs <- as.data.frame(available.packages(repos = "https://cloud.r-project.org"), stringsAsFactors = FALSE)
# https://cran.r-project.org/src/contrib/BH_1.75.0-0.tar.gz
pkgs <- pkgs[pkgs$Package == pkgname,]
if (nrow(pkgs) == 0) {
stop("Unknown package '", pkgname, "'")
}
stopifnot(nrow(pkgs) == 1)
pkgfile <- target(paste0(pkgname, "_", pkgs$Version, ".tar.gz"))
url <- file.path(pkgs$Repository, basename(pkgfile))
download.file(url, pkgfile)
on.exit(if (file.exists(pkgfile)) unlink(pkgfile))
res <- system.time({
utils::install.packages(pkgfile, repos = NULL, type = type, ...)
})
res
}
write_random_csv <- function(path, bytes) {
rows <- 0.0291268 * bytes
df <- data.frame(x1 = runif(rows), x2 = runif(rows))
system.time({
data.table::fwrite(df, file = path, row.names = FALSE, col.names = FALSE)
})
}
write_static_csv <- function(path, num_files, num_iter) {
num_rows <- as.integer(static_df_rows / num_files)
start <- (num_iter - 1) * num_rows
end <- start + num_rows
sub_df <- static_df[start:end, ]
data.table::fwrite(sub_df, file = path, row.names = FALSE, col.names = FALSE)
}
dump_cache <- function() {
if (Sys.info()["sysname"] %in% c("Darwin", "Linux")) {
system("/usr/local/sbin/purge-disk-cache")
} else {
warning("Clearing disk cache is not possible on this platform, benchmark results are unreliable!")
}
}