Aggregating by a column whose name is passed as a string in R

I would like to group the data in a data.frame by two columns and then sum a specific third column. For instance:

> aggregate(mpg~gear+cyl, data=mtcars, FUN=sum)
  gear cyl   mpg
1    3   4  21.5
2    4   4 215.4
3    5   4  56.4
4    3   6  39.5
5    4   6  79.0
6    5   6  19.7
7    3   8 180.6
8    5   8  30.8

Now I need to do this several times for different columns, so I would like to write a function that generalizes it. To keep things simple, it takes a data.frame and the name of one of the grouping columns and performs the same aggregation.

# Intended to sum `mpg` grouped by `gear` and by the column named in `colname`.
# NOTE: as written this fails: `colname` appears literally in the formula, so
# R looks for a variable called `colname` instead of using the argument's value.
agg.data <- function(df, colname) {
  aggregate(mpg~gear+colname, data=df, FUN=sum) 
}

Running this will produce:

Error in eval(expr, envir, enclos) : object 'colname' not found

How can I pass the value of colname to the aggregation?

+4
source share
6 answers

Paste together the string representation of your formula and pass that string as the argument to formula():

# Sum `mpg` for every combination of `gear` and the column named by `colname`.
#
# df      - data frame holding `mpg`, `gear` and the column named by `colname`
# colname - name of the second grouping column, given as a string
#
# The formula is assembled as text first so that the *value* of `colname`
# ends up in it, then converted into a real formula object for aggregate().
agg.data <- function(df, colname) {
  formula_text <- paste0("mpg~gear+", colname)
  grouping_formula <- formula(formula_text)
  aggregate(grouping_formula, data = df, FUN = sum)
}

> agg.data(mtcars, "cyl")
  gear cyl   mpg
1    3   4  21.5
2    4   4 215.4
3    5   4  56.4
4    3   6  39.5
5    4   6  79.0
6    5   6  19.7
7    3   8 180.6
8    5   8  30.8
+7
source

Using data.table:

# Sum `mpg` within each combination of `gear` and the column named by `col`.
#
# dt  - a data.table with `mpg`, `gear` and the column named by `col`
# col - name of the second grouping column, given as a string
fun.dt <- function(dt, col) {
  dt[
    ,
    list(mpg = sum(mpg)),
    by = c("gear", col)
  ]
}

# Load data.table with library() so a missing package stops the script here
# (require() would only return FALSE and let the next line fail instead).
library(data.table)
# Convert the built-in mtcars data frame and aggregate mpg by gear and cyl.
dt <- as.data.table(mtcars)
fun.dt(dt, "cyl")
#    gear cyl   mpg
# 1:    4   6  79.0
# 2:    4   4 215.4
# 3:    3   6  39.5
# 4:    3   8 180.6
# 5:    3   4  21.5
# 6:    5   4  56.4
# 7:    5   8  30.8
# 8:    5   6  19.7

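The `by` argument of data.table accepts a character vector of column names, so the grouping column can be passed as a string.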

+2

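You can also avoid the formula interface of aggregate(...) and pass the value vector and the grouping columns directly, like this: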

# Sum `mpg` by `gear` and by the column named in `colname`, without a formula.
#
# df      - data frame holding `mpg`, `gear` and the column named by `colname`
# colname - name of the second grouping column, given as a string
#
# The grouping list is unnamed, so aggregate() labels the result columns
# Group.1, Group.2 and x.
agg.data2 <- function(df, colname) {
  values <- df[["mpg"]]
  groups <- list(df[["gear"]], df[[colname]])
  aggregate(values, groups, FUN = sum)
}
agg.data2(mtcars, "cyl")
#  Group.1 Group.2     x
#1       3       4  21.5
#2       4       4 215.4
#3       5       4  56.4
#4       3       6  39.5
#5       4       6  79.0
#6       5       6  19.7
#7       3       8 180.6
#8       5       8  30.8

dplyr:

library(dplyr)
# Sum `mpg` by `gear` and by the column named in `colname` using dplyr.
#
# df      - data frame holding `mpg`, `gear` and the column named by `colname`
# colname - name of the second grouping column, given as a string
#
# Returns an ungrouped result with the two grouping columns and a `sum` column.
# The deprecated group_by_(.dots = ...) is replaced by across(all_of(...)),
# which selects grouping columns from a character vector of names.
agg.data.dplyr <- function(df, colname) {
  df %>%
    group_by(across(all_of(c("gear", colname)))) %>%
    summarise(sum = sum(mpg)) %>%
    ungroup()
}
agg.data.dplyr(mtcars, "cyl")
+1

Using deparse() and substitute(), so the column name can be passed unquoted:

# Sum `mpg` by `gear` and by a column given as a bare (unquoted) name.
#
# df      - data frame holding `mpg`, `gear` and the requested column
# colname - unquoted column name, e.g. agg.data(mtcars, cyl)
#
# substitute() captures the expression the caller typed for `colname` and
# deparse() turns it into the text used to index `df`.
agg.data <- function(df, colname) {
  col_label <- deparse(substitute(colname))
  grouping <- list(df$gear, df[, col_label])
  aggregate(df$mpg, grouping, FUN = sum)
}

agg.data(mtcars, cyl)
#   Group.1 Group.2     x
# 1       3       4  21.5
# 2       4       4 215.4
# 3       5       4  56.4
# 4       3       6  39.5
# 5       4       6  79.0
# 6       5       6  19.7
# 7       3       8 180.6
# 8       5       8  30.8
+1

If you want to pass the column name unquoted, as in ggplot, with(), etc., use substitute().

# Sum `mpg` by `gear` and by a column given as a bare name (or a string).
#
# df      - data frame holding `mpg`, `gear` and the requested column
# colname - column name, unquoted (cyl) or quoted ("cyl")
#
# Returns the data frame produced by aggregate() with columns gear, <colname>
# and mpg. The data passed in `df` is the data that gets aggregated; it is no
# longer silently replaced by the global `mtcars`.
agg.data3 <- function(df, colname) {
  # Capture what the caller typed; as.character() maps both a symbol and a
  # string literal to the plain column name.
  col_label <- as.character(substitute(colname))
  aggregate(formula(paste0("mpg~gear+", col_label)), data = df, FUN = sum)
}

agg.data3(mtcars, cyl)
0

You can probably find a function for this in the tidyverse; however, here is a function that I often use for aggregation with aggregate(). It covers most of what was discussed above with formula(), but in a more general form:

# Summary statistics of one variable for each combination of grouping columns.
#
# df      - data.frame to aggregate
# var.nm  - name (string) of the target variable to be aggregated
# agg.id  - name(s) of the grouping column(s): a single string or a
#           character vector
#
# Returns a data.frame with the grouping columns (named after `agg.id`) first,
# followed by mean, median, sd, min, max, sum and count, one row per group
# produced by aggregate().
get.stat <- function(df, var.nm, agg.id) {
  stopifnot(
    is.character(var.nm), length(var.nm) == 1L,
    is.character(agg.id), length(agg.id) >= 1L
  )

  # Build the aggregation formula once instead of re-pasting it per statistic.
  agg.formula <- formula(paste0(var.nm, "~", paste(agg.id, collapse = "+")))
  n.id <- length(agg.id)

  # One aggregate() run per statistic; formula and data are identical for
  # every run, so the group rows line up across the results.
  stat.funs <- list(
    mean = mean, median = median, sd = sd,
    min = min, max = max, sum = sum, count = length
  )
  agg.results <- lapply(stat.funs, function(f) aggregate(agg.formula, df, f))

  # Grouping columns come first in every aggregate() result; take them from
  # the first one and label them with the names the caller supplied.
  groups <- agg.results[[1L]][seq_len(n.id)]
  names(groups) <- agg.id

  # The aggregated value sits right after the grouping columns.
  stats <- lapply(agg.results, function(res) res[[n.id + 1L]])

  cbind(groups, as.data.frame(stats))
}

With this function loaded, you can simply call:

# Example: summarise mpg by gear and cyl in the built-in mtcars data.
get.stat(mtcars, "mpg", c("gear", "cyl"))

0
source

All Articles