I would suggest looking at my `cSplit` function or approaching the problem manually.
The `cSplit` approach:
# Call cSplit on `data`, naming the "Values" column and "," as the separator.
# NOTE(review): cSplit is not defined in this snippet; presumably it comes from
# the splitstackshape package -- confirm it is loaded before running.
cSplit(data, "Values", ",")
The manual approach:
# Manual approach: split `data$Values` on literal commas and spread the pieces
# into a character matrix with one column per item position (V1, V2, ...).
# Rows with fewer items than the widest row are padded with NA.
Split <- strsplit(data$Values, ",", fixed = TRUE)
# Number of items found in each row (0 for an empty string).
Ncol <- vapply(Split, length, 1L)
# Width of the widest row; the leading 0L keeps this well-defined (0 columns)
# when `data` has no rows instead of returning -Inf with a warning.
Nmax <- max(0L, Ncol)
# sprintf() returns character(0) for zero columns, whereas paste0("V", ...)
# would return the single name "V" and make the dimnames length mismatch.
M <- matrix(NA_character_, nrow = nrow(data), ncol = Nmax,
            dimnames = list(NULL, sprintf("V%d", seq_len(Nmax))))
Vals <- unlist(Split, use.names = FALSE)
# Fill by (row, column) index pairs: row i receives its items in columns
# 1..Ncol[i]. Skipped entirely when there is nothing to place.
if (length(Vals) > 0L) {
  M[cbind(rep(seq_len(nrow(data)), Ncol), sequence(Ncol))] <- Vals
}
data.table(ID = data$ID, M)
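^^ This is the manual counterpart of the `cSplit` call above: it builds one column per comma-separated item (V1, V2, ...), padding shorter rows with NA.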
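Note also the "data.table" + "reshape2" approach (fun1a and fun1b below): because it groups by ID, rows that share an ID are combined into a single output row, so its result differs from the other approaches when ID is not unique.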
, " " ( ).
Sample data:
# Build a reproducible test data set of 100,000 rows.
# Fix the RNG seed so the generated data are the same on every run.
set.seed(1)
# Number of items each row will hold: 100,000 draws (with replacement) from 0:100.
a <- sample(0:100, 100000, TRUE)
# For each count x, draw x values from 0:100 (with replacement) and join them
# into one comma-separated string.
Values <- vapply(a, function(x)
paste(sample(0:100, x, TRUE), collapse = ","), character(1L))
# Overwrite a random 15% of the strings with "" to simulate empty entries.
Values[sample(length(Values), length(Values) * .15)] <- ""
# IDs 1..80000 followed by 1..20000, so IDs 1..20000 each occur twice.
ID <- c(1:80000, 1:20000)
# Keep Values as character (not factor) so strsplit() can be applied to it.
data <- data.frame(ID, Values, stringsAsFactors = FALSE)
# data.table version of the same data.
DT <- as.data.table(data)
The functions to compare:
# data.table + dcast approach, using the default (regex) split.
#
# inDT: a data.table with columns `ID` and `Values` (comma-separated strings).
#
# Returns the result of dcast.data.table(): one row per distinct ID with the
# split items spread over columns named v1, v2, ...; missing cells are NA.
# Because the split is grouped by ID, rows sharing an ID are pooled together.
fun1a <- function(inDT) {
  # Operate on the argument (the original read the global `DT` and silently
  # ignored `inDT`).
  data2 <- inDT[, list(Values = unlist(strsplit(Values, ","))), by = ID]
  # Number the items within each ID to get the target column name.
  data2[, Var := paste0("v", seq_len(.N)), by = ID]
  dcast.data.table(data2, ID ~ Var,
                   fill = NA_character_,
                   value.var = "Values")
}
# data.table + dcast approach, using a fixed-string split (fixed = TRUE) and
# unlist(use.names = FALSE); otherwise identical to fun1a.
#
# inDT: a data.table with columns `ID` and `Values` (comma-separated strings).
#
# Returns the result of dcast.data.table(): one row per distinct ID with the
# split items spread over columns named v1, v2, ...; missing cells are NA.
# Because the split is grouped by ID, rows sharing an ID are pooled together.
fun1b <- function(inDT) {
  # Operate on the argument (the original read the global `DT` and silently
  # ignored `inDT`).
  data2 <- inDT[, list(Values = unlist(
    strsplit(Values, ",", fixed = TRUE),
    use.names = FALSE)), by = ID]
  # Number the items within each ID to get the target column name.
  data2[, Var := paste0("v", seq_len(.N)), by = ID]
  dcast.data.table(data2, ID ~ Var,
                   fill = NA_character_,
                   value.var = "Values")
}
# Thin wrapper that runs cSplit on the "Values" column with "," as separator.
#
# inDT: the table to split; passed straight through to cSplit.
#
# Returns whatever cSplit returns.
fun2 <- function(inDT) {
  # Pass the argument on (the original passed the global `DT` and silently
  # ignored `inDT`).
  cSplit(inDT, "Values", ",")
}
# Manual split: build a character matrix with one column per item position
# and bind it to the ID column.
#
# inDF: a data.frame (or data.table) with columns `ID` and `Values`, where
#       `Values` holds comma-separated strings.
#
# Returns data.table(ID, V1, V2, ...): one row per input row, with items
# beyond a row's own length left as NA. When no row contains any item the
# matrix has zero columns.
fun3 <- function(inDF) {
  Split <- strsplit(inDF$Values, ",", fixed = TRUE)
  # Number of items found in each row (0 for an empty string).
  Ncol <- vapply(Split, length, 1L)
  # Width of the widest row; the leading 0L keeps this well-defined (0 columns)
  # for zero-row input instead of returning -Inf with a warning.
  Nmax <- max(0L, Ncol)
  # sprintf() returns character(0) for zero columns, whereas paste0("V", ...)
  # would return the single name "V" and make the dimnames length mismatch.
  M <- matrix(NA_character_, nrow = nrow(inDF), ncol = Nmax,
              dimnames = list(NULL, sprintf("V%d", seq_len(Nmax))))
  Vals <- unlist(Split, use.names = FALSE)
  # Fill by (row, column) index pairs: row i receives its items in columns
  # 1..Ncol[i]. Skipped entirely when there is nothing to place.
  if (length(Vals) > 0L) {
    M[cbind(rep(seq_len(nrow(inDF)), Ncol), sequence(Ncol))] <- Vals
  }
  data.table(ID = inDF$ID, M)
}
Benchmarks:
# Load the benchmarking package.
library(microbenchmark)
# Time fun2 (on the data.table) and fun3 (on the data.frame), 20 runs each.
microbenchmark(fun2(DT), fun3(data), times = 20)
# fun1a and fun1b are timed with a single run each.
system.time(fun1a(DT))
system.time(fun1b(DT))
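Note that fun1a and fun1b are each timed with a single `system.time()` run, whereas fun2 and fun3 are benchmarked over 20 runs.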