I am looking for an easier way to aggregate a numeric variable and calculate percentages with data.table. The following code produces the desired result; my question is whether there is a better way to get the same result. I am not very familiar with the package, so any advice would be helpful.
I would like to have the following columns:
second_factor_variable third_factor_variable factor_variable porc porcentaje
1: HIGH C > 200 0.04456544 4 %
2: LOW A 51 - 100 0.31739130 32 %
3: LOW A 101 - 200 0.68260870 68 %
4: LOW A 26 - 50 0.00000000 0 %
Here porc is the numeric proportion and porcentaje is the rounded percentage, used as a label in the ggplot call.
library("ggplot2")
library("scales")
library("data.table")
# Simulated sample: 10,000 normal draws (fixed seed), keeping only the
# strictly positive values.
set.seed(123)
df <- data.frame(x = rnorm(n = 10000, mean = 100, sd = 50))
df <- df[df$x > 0, , drop = FALSE]

# Five-way binning of x; every interval is closed on the right.
df[["factor_variable"]] <- cut(
  df[["x"]],
  breaks = c(0, 25, 50, 100, 200, 1e5),
  labels = c("0 - 25", "26 - 50", "51 - 100", "101 - 200", "> 200"),
  right = TRUE
)

# Two-way split of x at 100.
df[["second_factor_variable"]] <- cut(
  df[["x"]],
  breaks = c(0, 100, 1e5),
  labels = c("LOW", "HIGH"),
  right = TRUE
)

# Three-way split of x at 50 and 100.
df[["third_factor_variable"]] <- cut(
  df[["x"]],
  breaks = c(0, 50, 100, 1e5),
  labels = c("A", "B", "C"),
  right = TRUE
)

# Print the structure of the resulting data frame.
str(df)
# Convert the data frame to a data.table for grouped aggregation.
DT <- as.data.table(df)

# For every (second_factor_variable, third_factor_variable) group, emit one
# row per level of factor_variable with:
#   porc       - share of the group's rows falling in that level
#   porcentaje - the same share as a rounded percentage label, e.g. "32 %"
dt <- DT[, {
  counts <- table(factor_variable)
  level_names <- names(counts)
  porc <- as.numeric(counts) / .N
  list(
    # Take the labels from the table itself so each proportion is paired
    # with its own level; unique(DT$factor_variable) is ordered by first
    # appearance in the whole table, not by level, and mislabels the rows.
    factor_variable = factor(level_names, levels = level_names),
    porc = porc,
    porcentaje = paste(round(porc * 100), "%")
  )
}, by = c("second_factor_variable", "third_factor_variable")]
EDIT
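Following agstudy's answer, I moved the calculation into a function (which also builds porcentaje). Grouping by a single variable, its output can be compared with a manual calculation on one subset: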
#' Share of observations per level of a grouping vector.
#'
#' @param factor_variable A factor (or any vector accepted by `table()`).
#' @return A list of three equal-length elements, one entry per tabulated
#'   level: `factor_variable` (the level, as a factor in table order),
#'   `porc` (proportion of observations in that level) and `porcentaje`
#'   (the proportion as a rounded percentage label such as "32 %").
grp <- function(factor_variable) {
  counts <- table(factor_variable)
  level_names <- names(counts)
  porc <- as.numeric(counts) / length(factor_variable)
  list(
    # One label per tabulated level; `factor_variable[1]` would repeat the
    # first observed value on every row.
    factor_variable = factor(level_names, levels = level_names),
    porc = porc,
    # Scale to percent before rounding; round(porc, 0) * 100 can only
    # produce 0 or 100.
    porcentaje = paste(round(porc * 100), "%")
  )
}
# Apply grp() within each level of second_factor_variable.
DT[, grp(factor_variable), by = c("second_factor_variable")]

# Manual calculation on the "LOW" rows only, for comparison.
DT2 <- DT[which(DT[["second_factor_variable"]] %in% "LOW")]
table(DT2[["factor_variable"]]) / nrow(DT2)
The same function, grouped by both variables:
# Apply grp() within each (second_factor_variable, third_factor_variable) pair.
DT[
  ,
  grp(factor_variable),
  by = c("second_factor_variable", "third_factor_variable")
]