Aggregation using data.table

I am looking for an easier way to aggregate a numeric variable and calculate percentages with data.table. The following code produces the desired result; my question is whether there is a better way to get the same result. I am not very familiar with the package, so any advice would be helpful.

I would like to have the following columns:

   second_factor_variable third_factor_variable factor_variable       porc porcentaje
1:                   HIGH                     C           > 200 0.04456544        4 %
2:                    LOW                     A        51 - 100 0.31739130       32 %
3:                    LOW                     A       101 - 200 0.68260870       68 %
4:                    LOW                     A         26 - 50 0.00000000        0 %

Here porc is the numeric share (a proportion between 0 and 1) and porcentaje is the same value expressed as a rounded percentage, used as a label in the ggplot call.

library("ggplot2")
library("scales")
library("data.table")

### Generate some data
# Fixed seed so the simulated sample is reproducible.
set.seed(123)

# 10,000 normal draws (mean 100, sd 50); only strictly positive values are kept.
df <- data.frame(x = rnorm(10000, mean = 100, sd = 50))
df <- df[df$x > 0, , drop = FALSE]

# Three binnings of the same variable, all with right-closed intervals.
# Five-class binning.
df[["factor_variable"]] <- cut(
  df$x,
  breaks = c(0, 25, 50, 100, 200, 100000),
  labels = c("0 - 25", "26 - 50", "51 - 100", "101 - 200", "> 200"),
  right = TRUE
)

# Two-class binning, split at 100.
df[["second_factor_variable"]] <- cut(
  df$x,
  breaks = c(0, 100, 100000),
  labels = c("LOW", "HIGH"),
  right = TRUE
)

# Three-class binning, split at 50 and 100.
df[["third_factor_variable"]] <- cut(
  df$x,
  breaks = c(0, 50, 100, 100000),
  labels = c("A", "B", "C"),
  right = TRUE
)

# Print the structure of the resulting data frame.
str(df)

### Aggregate
# For every (second_factor_variable, third_factor_variable) group, compute the
# share of rows falling into each level of factor_variable, plus a rounded
# "NN %" label for plotting.
DT <- data.table(df)
dt <- DT[, {
  lvls <- levels(factor_variable)
  # table() on a factor returns counts in level order, so the labels must be
  # taken in level order as well. unique(DT$factor_variable) is in order of
  # first appearance over the whole table and attaches shares to wrong labels.
  porc <- as.numeric(table(factor_variable)) / .N
  list(
    factor_variable = factor(lvls, levels = lvls),
    porc = porc,
    # Scale to percent first, then round to a whole number.
    porcentaje = paste(round(porc * 100), "%")
  )
}, by = c("second_factor_variable", "third_factor_variable")]

EDIT

agstudy , , ( porcentaje). , , .

# Share of each tabulated value of `factor_variable`.
#
# Args:
#   factor_variable: a factor (or any vector that table() can tabulate).
#
# Returns:
#   A list of three equal-length vectors, one entry per tabulated value:
#     factor_variable - the value's label (character),
#     porc            - count divided by length(factor_variable),
#     porcentaje      - the share as a rounded percentage string, e.g. "32 %".
grp <- function(factor_variable) {
  counts <- table(factor_variable)
  porc <- as.numeric(counts) / length(factor_variable)
  list(
    # Labels come from the table itself so that they line up with `porc`;
    # factor_variable[1] returned a single label for a multi-row result.
    factor_variable = names(counts),
    porc = porc,
    # Multiply before rounding: round(porc, 0) * 100 can only yield 0 or 100.
    porcentaje = paste(round(porc * 100), "%")
  )
}

# Apply grp() to factor_variable within each second_factor_variable group.
DT[, grp(factor_variable), by = .(second_factor_variable)]

# The same proportions computed directly on the "LOW" subset, for comparison.
DT2 <- DT[DT[["second_factor_variable"]] %in% "LOW"]
table(DT2[["factor_variable"]]) / nrow(DT2)

, , :

# Apply grp() within each (second_factor_variable, third_factor_variable) group.
DT[, grp(factor_variable),
   by = .(second_factor_variable, third_factor_variable)]
+3
2

2 : porc DT factor_variable

# Inline j expression: per (second_factor_variable, third_factor_variable)
# group, the share of rows in each factor_variable level and a "NN %" label.
DT[, {
  counts <- table(factor_variable)
  porc <- as.numeric(counts) / .N
  list(
    # One label per tabulated level, in the same order as `porc`;
    # factor_variable[1] would repeat a single label on every row.
    factor_variable = names(counts),
    porc = porc,
    # Multiply before rounding: round(porc, 0) * 100 can only yield 0 or 100.
    porcentaje = paste(round(porc * 100), "%")
  )
}, by = c("second_factor_variable", "third_factor_variable")]
+4

- . .

, . , .

. , DT, , , . , porcentaje.

factor_variable = levels(factor_variable) 

# Share of each tabulated value of `factor_variable`, labelled per value.
#
# Args:
#   factor_variable: a factor (or any vector that table() can tabulate).
#
# Returns:
#   A list of three equal-length vectors, one entry per tabulated value:
#     factor_variable - the value's label (character),
#     porc            - count divided by length(factor_variable),
#     porcentaje      - the share as a rounded percentage string, e.g. "32 %".
grp2 <- function(factor_variable) {
  # Tabulate once and reuse the counts for both output columns.
  counts <- table(factor_variable)
  porc <- as.numeric(counts) / length(factor_variable)
  list(
    # names(counts) equals levels() for a factor, but also works for
    # non-factor input and always has the same length as `porc`.
    factor_variable = names(counts),
    porc = porc,
    # Scale to percent, then round to a whole number (digits defaults to 0).
    porcentaje = paste(round(porc * 100), "%")
  )
}


# Apply grp2() to factor_variable within each second_factor_variable group.
DT[, grp2(factor_variable), by = .(second_factor_variable)]
0

All Articles