Summarizing multiple columns with dplyr? [duplicate]

In dplyr (>=1.00) you may use across(everything() in summarise to apply a function to all variables:

library(dplyr)

df %>% group_by(grp) %>% summarise(across(everything(), list(mean)))
#> # A tibble: 3 x 5
#>     grp     a     b     c     d
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1  3.08  2.98  2.98  2.91
#> 2     2  3.03  3.04  2.97  2.87
#> 3     3  2.85  2.95  2.95  3.06

Alternatively, the purrrlyr package provides the same functionality:

library(purrrlyr)
df %>% slice_rows("grp") %>% dmap(mean)
#> # A tibble: 3 x 5
#>     grp     a     b     c     d
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1  3.08  2.98  2.98  2.91
#> 2     2  3.03  3.04  2.97  2.87
#> 3     3  2.85  2.95  2.95  3.06

Also don’t forget about data.table (use keyby to sort sort groups):

library(data.table)
setDT(df)[, lapply(.SD, mean), keyby = grp]
#>    grp        a        b        c        d
#> 1:   1 3.079412 2.979412 2.979412 2.914706
#> 2:   2 3.029126 3.038835 2.967638 2.873786
#> 3:   3 2.854701 2.948718 2.951567 3.062678

Let’s try to compare performance.

library(dplyr)
library(purrrlyr)
library(data.table)
library(bench)
set.seed(123)
n <- 10000
df <- data.frame(
  a = sample(1:5, n, replace = TRUE), 
  b = sample(1:5, n, replace = TRUE), 
  c = sample(1:5, n, replace = TRUE), 
  d = sample(1:5, n, replace = TRUE), 
  grp = sample(1:3, n, replace = TRUE)
)
dt <- setDT(df)
mark(
  dplyr = df %>% group_by(grp) %>% summarise(across(everything(), list(mean))),
  purrrlyr = df %>% slice_rows("grp") %>% dmap(mean),
  data.table = dt[, lapply(.SD, mean), keyby = grp],
  check = FALSE
)
#> # A tibble: 3 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        2.81ms   2.85ms      328.        NA     17.3
#> 2 purrrlyr     7.96ms   8.04ms      123.        NA     24.5
#> 3 data.table 596.33µs 707.91µs     1409.        NA     10.3

Leave a Comment