--- title: "Performance of the bit package" author: "Dr. Jens Oehlschlägel" date: '`r Sys.Date()`' output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Performance of the bit package} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo = FALSE, results = "hide", message = FALSE} knitr::opts_chunk$set(collapse = TRUE, comment = "#>") library(bit) library(microbenchmark) # rmarkdown::render("vignettes/bit-performance.Rmd") # these are the real settings for the performance vignette times <- 5 Domain <- c(small=1e3, big=1e6) Sample <- c(small=1e3, big=1e6) # these are the settings to keep the cost of CRAN low #times <- 5 #Domain <- c(small=1e1, big=1e3) #Sample <- c(small=1e1, big=1e3) pagebreak <- function() { if (knitr::is_latex_output()) "\\newpage" else '
' } ``` --- ## A performance example Before we measure performance of the main functionality of the package, note that something simple as '(a:b)[-i]' can and has been accelerated in this package: ```{r, echo=TRUE, results='asis'} a <- 1L b <- 1e7L i <- sample(a:b, 1e3) x <- c( R = median(microbenchmark((a:b)[-i], times=times)$time), bit = median(microbenchmark(bit_rangediff(c(a, b), i), times=times)$time), merge = median(microbenchmark(merge_rangediff(c(a, b), bit_sort(i)), times=times)$time) ) knitr::kable(as.data.frame(as.list(x / x["R"] * 100)), caption="% of time relative to R", digits=1) ``` The vignette is compiled with the following performance settings: `r times` replications with domain size small `r Domain["small"]` and big `r Domain["big"]`, sample size small `r Sample["small"]` and big `r Sample["big"]`. ## Boolean data types > "A designer knows he has achieved perfection not when there is nothing left to add, but when there is nothing left to take away." "Il semble que la perfection soit atteinte non quand il n'y a plus rien à ajouter, mais quand il n'y a plus rien à retrancher" (Antoine de St. Exupery, Terre des Hommes (Gallimard, 1939), p. 60.) We compare memory consumption (n=`r format(Sample[["big"]])`) and runtime (median of `r times` replications) of the different `booltype`s for the following filter scenarios: ```{r, echo=FALSE, results='asis', echo=3:5} # TODO(r-lib/lintr#773): nolint as a chunk option. # nolint start: strings_as_factors_linter. knitr::kable( data.frame( coin="random 50%", often="random 99%", rare="random 1%", chunk="contiguous chunk of 5%" ), caption="selection characteristic" ) # nolint end: strings_as_factors_linter. ``` ```{r, echo=FALSE, results='asis'} B <- booltypes[c("logical", "bit", "bitwhich", "which", "ri")] M <- c("size", "[]", "[which]", "[which]<-TRUE", "[]<-logical", "!", "&", "|", "==", "!=", "summary") G <- list( coin = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.5, 0.5)), often = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.01, 0.99)), rare = function(n) sample(c(FALSE, TRUE), n, replace=TRUE, prob=c(0.99, 0.01)), chunk = function(n) ri(n %/% 20, 2L * n %/% 20, n) ) X <- vector("list", length(B) * length(G)) dim(X) <- c(booltype=length(B), data=length(G)) dimnames(X) <- list(booltype=names(B), data=names(G)) tim <- array(NA, dim=c(booltype=length(B), metric=length(M), data=length(G)), dimnames=list(booltype=names(B), metric=M, data=names(G)) ) for (g in names(G)) { x <- G[[g]](Sample[["big"]]) if (g %in% c("coin", "often", "rare")) w <- as.which(as.logical(x)) for (b in B) { if (booltypes[[b]] < 'ri' || (b == 'ri' && g == 'chunk')) { X[[b, g]] <- as.booltype(x, b) if (g %in% c("coin", "often", "rare") && b %in% c("logical", "bit", "bitwhich")) { l <- as.booltype(logical(Sample[["big"]]), b) tim[b, "[which]", g] <- median(microbenchmark(l[w], times=times)$time) tim[b, "[which]<-TRUE", g] <- median(microbenchmark(l[w] <- TRUE, times=times)$time) tim[b, "[]", g] <- median(microbenchmark(l[], times=times)$time) tim[b, "[]<-logical", g] <- median(microbenchmark(l[] <- x, times=times)$time) } tim[b, "size", g] <- object.size(X[[b, g]]) } } } for (g in names(G)) { for (b in c("logical", "bit", "bitwhich")) { x <- X[[b, g]] if (!is.null(x)) { tim[b, "!", g] <- median(microbenchmark(!x, times=times)$time) tim[b, "&", g] <- median(microbenchmark(x & x, times=times)$time) tim[b, "|", g] <- median(microbenchmark(x | x, times=times)$time) tim[b, "==", g] <- median(microbenchmark(x == x, times=times)$time) tim[b, "!=", g] <- median(microbenchmark(x != x, times=times)$time) tim[b, "summary", g] <- median(microbenchmark(summary.booltype(x), times=times)$time) } } } i <- match("size", M) for (b in rev(names(B))) { # logical was in first position, so we do this last! tim[b, i, ] <- 100 * tim[b, i, ] / tim["logical", i, ] tim[b, -i, ] <- 100 * tim[b, -i, ] / max(tim["logical", -i, ], na.rm=TRUE) } #rm(X) ``` There are substantial savings in skewed filter situations: ```{r, echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'rare' scenario"} x <- tim[1:3, , "rare"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0, max(100, max(x))), labels=m, pch=c("R", "b", "w"), col=c("black", "blue", "red"), main="% size and timings in 'rare' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios" ) ``` ```{r, echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'often' scenario"} x <- tim[1:3, , "often"] dotchart(x, xlim=c(0, max(100, max(x))), labels=m, pch=c("R", "b", "w"), col=c("black", "blue", "red"), main="% size and timings in 'often' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios" ) ``` Even in non-skewed situations the new booltypes are competitive: ```{r, echo=FALSE, fig.cap = "% size and execution time for bit (b) and bitwhich (w) relative to logical (R) in the 'coin' scenario"} x <- tim[1:3, , "coin"] dotchart(x, xlim=c(0, max(100, max(x))), labels=m, pch=c("R", "b", "w"), col=c("black", "blue", "red"), main="% size and timings in 'coin' scenario", sub="l='logical' b='bit' w='bitwhich' % of max(R) in all scenarios" ) ``` Detailed tables follow. `r pagebreak()` ### % memory consumption of filter ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "size", ], 1), caption="% bytes of logical") ``` ### % time extracting ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "[]", ], 1), caption="% time of logical") ``` ### % time assigning ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "[]<-logical", ], 1), caption="% time of logical") ``` ### % time subscripting with 'which' ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "[which]", ], 1), caption="% time of logical") ``` ### % time assigning with 'which' ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "[which]<-TRUE", ], 1), caption="% time of logical") ``` ### % time Boolean NOT ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "!", ], 1), caption="% time for Boolean NOT") ``` ### % time Boolean AND ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "&", ], 1), caption="% time for Boolean &") ``` ### % time Boolean OR ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "|", ], 1), caption="% time for Boolean |") ``` ### % time Boolean EQUALITY ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "==", ], 1), caption="% time for Boolean ==") ``` ### % time Boolean XOR ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "!=", ], 1), caption="% time for Boolean !=") ``` ### % time Boolean SUMMARY ```{r, echo=FALSE, results='asis'} knitr::kable(round(tim[, "summary", ][1:2, 1:2], 1), caption="% time for Boolean summary") ``` --- ## Fast methods for `integer` set operations > "The space-efficient structure of bitmaps dramatically reduced the run time of sorting" (Jon Bently, Programming Pearls, Cracking the oyster, p. 7) ```{r, echo=FALSE, results='asis'} binaryDomain <- list( smallsmall = rep(Domain["small"], 2L), smallbig=Domain, bigsmall=rev(Domain), bigbig=rep(Domain["big"], 2L) ) binarySample <- list( smallsmall = rep(Sample["small"], 2L), smallbig=Sample, bigsmall=rev(Sample), bigbig=rep(Sample["big"], 2L) ) M <- c("R", "bit", "merge") G <- c("sort", "sortunique") D <- c("unsorted", "sorted") sortM <- vector("list", length(M) * length(G)) dim(sortM) <- c(method=length(M), goal=length(G)) dimnames(sortM) <- list(method=M, goal=G) sortM[["R", "sort"]] <- sort sortM[["R", "sortunique"]] <- function(x) sort(unique(x)) sortM[["bit", "sort"]] <- bit_sort sortM[["bit", "sortunique"]] <- bit_sort_unique timsort <- array(NA_integer_, dim=c(M=2, G=length(G), D=length(D), N=length(Domain)), dimnames=list(M=M[1:2], G=G, D=D, N=names(Domain)) ) for (n in names(Domain)) { x <- sample(Domain[[n]], Sample[[n]], replace = TRUE) d <- "unsorted" for (m in c("R", "bit")) { for (g in G) { timsort[m, g, d, n] <- median(microbenchmark(sortM[[m, g]](x), times=times)$time) } } x <- bit_sort(x) d <- "sorted" for (m in 1:2) { for (g in G) { timsort[m, g, d, n] <- median(microbenchmark(sortM[[m, g]](x), times=times)$time) } } } binaryU <- c("match", "in", "notin", "union", "intersect", "setdiff", "symdiff", "setequal", "setearly") binaryM <- vector("list", length(M) * length(binaryU)) dim(binaryM) <- c(method=length(M), task=length(binaryU)) dimnames(binaryM) <- list(method=M, task=binaryU) binaryM[["R", "match"]] <- match binaryM[["merge", "match"]] <- merge_match binaryM[["R", "in"]] <- get("%in%") binaryM[["bit", "in"]] <- bit_in binaryM[["merge", "in"]] <- merge_in binaryM[["R", "notin"]] <- function(x, y) !(x %in% y) binaryM[["bit", "notin"]] <- function(x, y) !bit_in(x, y) binaryM[["merge", "notin"]] <- merge_notin binaryM[["R", "union"]] <- union binaryM[["bit", "union"]] <- bit_union binaryM[["merge", "union"]] <- merge_union binaryM[["R", "intersect"]] <- intersect binaryM[["bit", "intersect"]] <- bit_intersect binaryM[["merge", "intersect"]] <- merge_intersect binaryM[["R", "setdiff"]] <- setdiff binaryM[["bit", "setdiff"]] <- bit_setdiff binaryM[["merge", "setdiff"]] <- merge_setdiff binaryM[["R", "symdiff"]] <- function(x, y) union(setdiff(x, y), setdiff(y, x)) binaryM[["bit", "symdiff"]] <- bit_symdiff binaryM[["merge", "symdiff"]] <- merge_symdiff binaryM[["R", "setequal"]] <- function(x, y) setequal(x, x) # we compare x to x which avoids early termination binaryM[["bit", "setequal"]] <- function(x, y) bit_setequal(x, x) binaryM[["merge", "setequal"]] <- function(x, y) merge_setequal(x, x) binaryM[["R", "setearly"]] <- function(x, y) setequal(x, y) # we compare x to x which avoids early termination binaryM[["bit", "setearly"]] <- function(x, y) bit_setequal(x, y) binaryM[["merge", "setearly"]] <- function(x, y) merge_setequal(x, y) unaryU <- c("unique", "duplicated", "anyDuplicated", "sumDuplicated") unaryM <- vector("list", length(M) * length(unaryU)) dim(unaryM) <- c(method=length(M), task=length(unaryU)) dimnames(unaryM) <- list(method=M, task=unaryU) unaryM[["R", "unique"]] <- unique unaryM[["bit", "unique"]] <- bit_unique unaryM[["merge", "unique"]] <- merge_unique unaryM[["R", "duplicated"]] <- duplicated unaryM[["bit", "duplicated"]] <- bit_duplicated unaryM[["merge", "duplicated"]] <- merge_duplicated unaryM[["R", "anyDuplicated"]] <- anyDuplicated unaryM[["bit", "anyDuplicated"]] <- bit_anyDuplicated unaryM[["merge", "anyDuplicated"]] <- merge_anyDuplicated unaryM[["R", "sumDuplicated"]] <- function(x) sum(duplicated(x)) unaryM[["bit", "sumDuplicated"]] <- bit_sumDuplicated unaryM[["merge", "sumDuplicated"]] <- merge_sumDuplicated tim <- array(NA_integer_, dim=c(M=length(M), U=length(unaryU) + length(binaryU), N=length(binaryDomain), D=length(D)), dimnames=list(M=M, U=c(unaryU, binaryU), N=names(binaryDomain), D=D) ) for (n in names(binaryDomain)) { xnam <- names(binaryDomain[[n]])[1] ynam <- names(binaryDomain[[n]])[2] x <- sample(binaryDomain[[n]][1], binarySample[[n]][1], replace = FALSE) y <- sample(binaryDomain[[n]][2], binarySample[[n]][2], replace = FALSE) d <- "unsorted" if (length(x) == length(y)) { for (u in unaryU) { for (m in setdiff(M, "merge")) { f <- unaryM[[m, u]] if (!is.null(f)) tim[m, u, n, d] <- median(microbenchmark(f(x), times=times)$time) } } } for (u in binaryU) { for (m in setdiff(M, "merge")) { f <- binaryM[[m, u]] if (!is.null(f)) tim[m, u, n, d] <- median(microbenchmark(f(x, y), times=times)$time) } } x <- bit_sort(x) y <- bit_sort(y) d <- "sorted" if (length(x) == length(y)) { for (u in unaryU) { for (m in M) { f <- unaryM[[m, u]] if (!is.null(f)) { tim[m, u, n, d] <- median(microbenchmark(f(x), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") { tim["merge", u, n, "unsorted"] <- timsort["bit", "sort", "unsorted", xnam] + tim["merge", u, n, "sorted"] } } } } } for (u in binaryU) { for (m in M) { f <- binaryM[[m, u]] if (!is.null(f)) { tim[m, u, n, d] <- median(microbenchmark(f(x, y), times=times)$time) # now plug-in measures for unsorted merge if (m == "merge") { tim["merge", u, n, "unsorted"] = timsort["bit", "sort", "unsorted", xnam] + timsort["bit", "sort", "unsorted", ynam] + tim["merge", u, n, "sorted"] } } } } } ``` ```{r, echo=FALSE, fig.cap = "Execution time for R (R) and bit (b)"} y <- timsort[, , , "big"] y <- 100 * y / max(y["R", , ], na.rm=TRUE) oldpar <- par(mfrow=c(2, 1), mar=c(5, 8, 2, 1)) x <- y[, , "unsorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R", "b"), xlab="execution time", main="unsorted", col=c("red", "blue") ) x <- y[, , "sorted"] dotchart(x, xlim=c(0, max(100, max(y))), labels="", pch=c("R", "b"), xlab="execution time", main="sorted", col=c("red", "blue") ) par(oldpar) ``` ```{r, echo=FALSE, results='hide'} tim2 <- tim for (n in names(binaryDomain)) { for (d in D) { tim2[, , n, d] <- 100 * tim[, , n, d] / max(tim["R", , n, d], na.rm=TRUE) } } ``` `r pagebreak()` ```{r, echo=FALSE, fig.cap = "Execution time for R, bit and merge relative to most expensive R in 'unsorted bigbig' scenario"} y <- tim2[, , "bigbig", ] y <- 100 * y / max(y["R", , ], na.rm=TRUE) x <- y[, , "unsorted"] m <- rep("", ncol(x)) m <- as.vector(rbind(m, colnames(x), m)) dotchart(x, xlim=c(0, max(100, max(y, na.rm=TRUE))), labels=m, pch=c("R", "b", "m"), col=c("red", "blue", "black"), main="Timings in 'unsorted bigbig' scenario", sub="R='hash' b='bit' m='merge'" ) ``` ```{r, echo=FALSE, fig.cap = "Execution time for R, bit and merge in 'sorted bigbig' scenario"} x <- y[, , "sorted"] dotchart(x, xlim=c(0, max(y, na.rm=TRUE)), labels=m, pch=c("R", "b", "m"), col=c("red", "blue", "black"), main="Timings in 'sorted bigbig' scenario", sub="R='hash' b='bit' m='merge'" ) ``` `r pagebreak()` ### % time for sorting ```{r, echo=FALSE, results='asis'} x <- 100 * timsort["bit", , , ] / timsort["R", , , ] s <- "sorted" knitr::kable(x[, s, ], caption=paste(s, "data relative to R's sort"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, s, ], caption=paste(s, "data relative to R's sort"), digits=1) ``` ### % time for unique ```{r, echo=FALSE, results='asis'} f <- function(u) { n <- c("smallsmall", "bigbig") x <- tim[c("bit", "merge", "merge"), u, n, ] dimnames(x)$M[3] <- "sort" dimnames(x)$N <- c("small", "big") x["sort", , "unsorted"] <- timsort["bit", "sort", "unsorted", ] x["sort", , "sorted"] <- 0 for (m in dimnames(x)$M) { x[m, , ] <- x[m, , ] / tim["R", u, n, ] * 100 } x } x <- f("unique") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for duplicated ```{r, echo=FALSE, results='asis'} x <- f("duplicated") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for anyDuplicated ```{r, echo=FALSE, results='asis'} x <- f("anyDuplicated") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for sumDuplicated ```{r, echo=FALSE, results='asis'} x <- f("sumDuplicated") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for match ```{r, echo=FALSE, results='asis'} f <- function(u) { x <- tim[c("bit", "merge", "merge"), u, , ] dimnames(x)$M[3] <- "sort" s <- timsort["bit", "sort", "unsorted", ] x["sort", , "unsorted"] <- rep(s, c(2, 2)) + c(s, s) x["sort", , "sorted"] <- 0 for (m in dimnames(x)$M) { x[m, , ] <- x[m, , ] / tim["R", u, , ] * 100 } x } x <- f("match") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for in ```{r, echo=FALSE, results='asis'} x <- f("in") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for notin ```{r, echo=FALSE, results='asis'} x <- f("notin") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for union ```{r, echo=FALSE, results='asis'} x <- f("union") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for intersect ```{r, echo=FALSE, results='asis'} x <- f("intersect") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for setdiff ```{r, echo=FALSE, results='asis'} x <- f("setdiff") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for symdiff ```{r, echo=FALSE, results='asis'} x <- f("symdiff") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for setequal ```{r, echo=FALSE, results='asis'} x <- f("setequal") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ### % time for setearly ```{r, echo=FALSE, results='asis'} x <- f("setearly") s <- "sorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ``` ```{r, echo=FALSE, results='asis'} s <- "unsorted" knitr::kable(x[, , s], caption=paste(s, "data relative to R"), digits=1) ```