matrix vs data.frame vs data.table

Summary: matrix and data.table are very close in performance. data.frame should be avoided in data- and operation-intensive cases.

1 Benchmark

1.1 Row appending

library(microbenchmark) # for microbenchmark
library(data.table)

#----------

# Shared fixtures: the same 1000 x 100 all-NA table held in each of the three
# containers under test.
mat0 <- matrix(nrow = 1000, ncol = 100)
df0 <- data.frame(mat0)
dt0 <- data.table(mat0)

# Row appended on every iteration (one value per column).
newRow <- seq_len(100)
# One iteration per element; the ranking may differ depending on the loop count.
loops <- seq_len(5000)

# ---------

# Matrix + rbind(): starting from the global `mat0`, append the global
# `newRow` once for every element of `loops`. The grown matrix is not
# returned; the function exists only to be timed.
mat_rbind <- function() {
  grown <- mat0
  for (step in seq_along(loops)) {
    grown <- rbind(grown, newRow)
  }
}

# data.frame + rbind(): starting from the global `df0`, append the global
# `newRow` once for every element of `loops`. The grown data.frame is not
# returned; the function exists only to be timed.
df_rbind <- function() {
  grown <- df0
  for (step in seq_along(loops)) {
    grown <- rbind(grown, newRow)
  }
}

# data.frame + indexed assignment: starting from the global `df0`, write the
# global `newRow` into the row just past the current end, once for every
# element of `loops`. Nothing is returned; the function exists only to be timed.
df_insert <- function() {
  grown <- df0
  for (step in seq_along(loops)) {
    next_row <- nrow(grown) + 1
    grown[next_row, ] <- newRow
  }
}

# data.table + rbind(): starting from the global `dt0`, append the global
# `newRow` (converted to a list, one element per column) once for every
# element of `loops`. Nothing is returned; the function exists only to be timed.
dt_rbind <- function() {
  grown <- dt0
  for (step in seq_along(loops)) {
    row_as_list <- as.list(newRow)
    grown <- rbind(grown, row_as_list)
  }
}

# data.table + rbindlist(): starting from the global `dt0`, bind the current
# table and the global `newRow` (as a list) once for every element of `loops`.
# Nothing is returned; the function exists only to be timed.
dt_rbindlist <- function() {
  grown <- dt0
  for (step in seq_along(loops)) {
    pieces <- list(grown, as.list(newRow))
    grown <- rbindlist(pieces)
  }
}
# Time every row-appending variant 50 times and show the summary table.
res <- microbenchmark(
  mat_rbind(),
  df_rbind(),
  df_insert(),
  dt_rbind(),
  dt_rbindlist(),
  times = 50
)

print(res)
# Captured output of print(res):
## Unit: seconds
##            expr      min       lq     mean   median       uq       max neval cld
##     mat_rbind() 29.61510 29.91575 30.20403 30.08436 30.20831  35.67312    50  b
##      df_rbind() 87.38882 88.11399 89.57409 88.34435 88.73782 112.14921    50    d
##     df_insert() 48.18800 48.56083 49.34227 48.84676 49.45061  58.04525    50   c
##      dt_rbind() 25.22989 25.42573 25.73572 25.53962 25.71169  33.92170    50 a
##  dt_rbindlist() 24.56441 24.74358 25.47515 24.81044 25.02734  37.53949    50 a

1.2 Row deletion

# Fixtures for the deletion benchmark: the same 10000 x 100 all-NA table held
# in each of the three containers under test.
mat0 <- matrix(nrow = 10000, ncol = 100)
df0 <- data.frame(mat0)
dt0 <- data.table(mat0)

# One row is removed per element, i.e. 5000 of the 10000 rows.
loops <- seq_len(5000)

#-----------------------

# Matrix + negative indexing: starting from the global `mat0`, drop the first
# row once for every element of `loops`. Nothing is returned; the function
# exists only to be timed.
#
# `drop = FALSE` keeps the object a matrix even when a single row is left.
# Without it the result collapses to a plain vector and the following
# `[-1, ]` fails with "incorrect number of dimensions", which happens as soon
# as `loops` is at least as long as `nrow(mat0)`.
mat_delete <- function() {
  mat <- mat0
  for (i in loops) {
    mat <- mat[-1, , drop = FALSE]
  }
}

# Matrix + tail(): starting from the global `mat0`, keep all but the first row
# once for every element of `loops`. Nothing is returned; the function exists
# only to be timed.
mat_tail <- function() {
  remaining <- mat0
  for (step in seq_along(loops)) {
    keep <- nrow(remaining) - 1
    remaining <- tail(remaining, keep)
  }
}

# data.frame + negative indexing: starting from the global `df0`, drop the
# first row once for every element of `loops`. Nothing is returned; the
# function exists only to be timed.
df_delete <- function() {
  remaining <- df0
  for (step in seq_along(loops)) {
    remaining <- remaining[-1, ]
  }
}

# data.frame + tail(): starting from the global `df0`, keep all but the first
# row once for every element of `loops`. Nothing is returned; the function
# exists only to be timed.
df_tail <- function() {
  remaining <- df0
  for (step in seq_along(loops)) {
    keep <- nrow(remaining) - 1
    remaining <- tail(remaining, keep)
  }
}

# data.table + negative indexing: starting from the global `dt0`, drop the
# first row once for every element of `loops`. Nothing is returned; the
# function exists only to be timed.
dt_delete <- function() {
  remaining <- dt0
  for (step in seq_along(loops)) {
    remaining <- remaining[-1, ]
  }
}

# data.table + tail(): starting from the global `dt0`, keep all but the first
# row once for every element of `loops`. Nothing is returned; the function
# exists only to be timed.
dt_tail <- function() {
  remaining <- dt0
  for (step in seq_along(loops)) {
    keep <- nrow(remaining) - 1
    remaining <- tail(remaining, keep)
  }
}
# Time every row-deletion variant 50 times and show the summary table.
res <- microbenchmark(
  mat_delete(),
  mat_tail(),
  df_delete(),
  df_tail(),
  dt_delete(),
  dt_tail(),
  times = 50
)

print(res)
## Unit: seconds
##          expr      min       lq     mean   median       uq      max neval 
##  mat_delete() 34.70892 35.27614 35.58425 35.43062 35.67061 38.85541    50  a
##    mat_tail() 38.09419 38.92384 39.26626 39.12545 39.33041 42.16310    50   b
##   df_delete() 47.66720 47.85000 48.04832 47.99169 48.12198 49.68098    50
##     df_tail() 74.40627 75.18312 75.42762 75.40833 75.54480 76.55802    50
##   dt_delete() 40.48840 40.72822 40.85509 40.87139 40.90501 41.73669    50
##     dt_tail() 41.30999 41.58012 41.68345 41.64761 41.75733 42.54702    50

1.3 Row selection

# Fixtures for the selection benchmark: a 10000 x 50 table whose first two
# columns both hold the row number; the remaining columns stay NA.
mat0 <- matrix(nrow = 10000, ncol = 50)
mat0[, 1:2] <- seq_len(10000) # recycled: each of the two columns gets 1..10000

df0 <- data.frame(mat0)
dt0 <- data.table(mat0)

# Lower bounds of the selection window: 1000, 1001, ..., 6000 (5001 iterations).
loops <- 1000:6000
#----------------------

# Matrix + logical row indexing: for every lower bound in `loops`, pick the
# rows of the global `mat0` whose first column lies strictly between the bound
# and bound + 50, and add up their second column. The grand total is printed
# (and thereby returned invisibly by print()).
mat_select <- function() {
  total <- 0
  for (lower in loops) {
    in_window <- mat0[, 1] > lower & mat0[, 1] < lower + 50
    total <- sum(mat0[in_window, 2]) + total
  }
  print(total)
}

# Matrix + subset(): for every lower bound in `loops`, subset the second
# column of the global `mat0` to the rows whose first column lies strictly
# between the bound and bound + 50, and add the values up. The grand total is
# printed (and thereby returned invisibly by print()).
mat_subset <- function() {
  total <- 0
  for (lower in loops) {
    in_window <- mat0[, 1] > lower & mat0[, 1] < lower + 50
    total <- sum(subset(mat0[, 2], in_window)) + total
  }
  print(total)
}

# data.frame + logical row indexing: for every lower bound in `loops`, pick
# the rows of the global `df0` whose first column lies strictly between the
# bound and bound + 50, and add up their second column. The grand total is
# printed (and thereby returned invisibly by print()).
df_select <- function() {
  total <- 0
  for (lower in loops) {
    in_window <- df0[, 1] > lower & df0[, 1] < lower + 50
    total <- sum(df0[in_window, 2]) + total
  }
  print(total)
}

# data.frame + subset(): for every lower bound in `loops`, subset the second
# column of the global `df0` to the rows whose first column lies strictly
# between the bound and bound + 50, and add the values up. The grand total is
# printed (and thereby returned invisibly by print()).
df_subset <- function() {
  total <- 0
  for (lower in loops) {
    in_window <- df0[, 1] > lower & df0[, 1] < lower + 50
    total <- sum(subset(df0[, 2], in_window)) + total
  }
  print(total)
}

# data.table + logical row indexing: for every lower bound in `loops`, pick
# the rows of the global `dt0` whose V1 lies strictly between the bound and
# bound + 50, keep column "V2" (selected by name via with = FALSE), and add the
# values up. The grand total is printed (and thereby returned invisibly by
# print()).
dt_select <- function() {
  total <- 0
  for (lower in loops) {
    picked <- dt0[dt0$V1 > lower & dt0$V1 < lower + 50, "V2", with = FALSE]
    total <- sum(picked) + total
  }
  print(total)
}

# data.table columns + subset(): for every lower bound in `loops`, subset
# column V2 of the global `dt0` to the rows whose V1 lies strictly between the
# bound and bound + 50, and add the values up. The grand total is printed (and
# thereby returned invisibly by print()).
dt_subset <- function() {
  total <- 0
  for (lower in loops) {
    in_window <- dt0$V1 > lower & dt0$V1 < lower + 50
    total <- sum(subset(dt0$V2, in_window)) + total
  }
  print(total)
}
# Time every row-selection variant 50 times and show the summary table.
res <- microbenchmark(
  mat_select(),
  mat_subset(),
  df_select(),
  df_subset(),
  dt_select(),
  dt_subset(),
  times = 50
)

print(res)
## Unit: seconds
##          expr      min       lq     mean   median       uq      max neval
##  mat_select() 1.991257 2.253587 2.362204 2.381161 2.480467 2.620633    50
##  mat_subset() 3.274175 3.483381 3.587154 3.564870 3.684579 3.879181    50
##   df_select() 1.014388 1.267431 1.310773 1.337663 1.391615 1.484692    50
##   df_subset() 1.355735 1.576415 1.652202 1.683493 1.716798 1.831174    50
##   dt_select() 3.768900 3.934258 4.118989 4.096022 4.292722 4.591615    50
##   dt_subset() 1.423912 1.572870 1.632260 1.663860 1.708612 1.828782    50
^Home Page^