UPDATE: VERSION 3
Found an even faster way. This feature also has high memory performance. The main reason for the previous function was slow due to copy / assignment occurring inside the lapply loop, as well as the rbinding result.
In the next version, we preliminary distribute the matrix with the appropriate size, and then change the values ββin the corresponding coordinates, which makes it very fast compared to other versions of the cycle.
funcGT3 <- function() { #Get list of column names in result resCol <- unique(dt[, unlist(strsplit(messy_string, split="\\$"))]) #Get dimension of result nresCol <- length(resCol) nresRow <- nrow(dt) #Create empty matrix with dimensions same as desired result mat <- matrix(rep(0, nresRow * nresCol), nrow = nresRow, dimnames = list(as.character(1:nresRow), resCol)) #split each messy_string by $ ll <- strsplit(dt[,messy_string], split="\\$") #Get coordinates of mat which we need to set to 1 coords <- do.call(rbind, lapply(1:length(ll), function(i) cbind(rep(i, length(ll[[i]])), ll[[i]] ))) #Set mat to 1 at appropriate coordinates mat[coords] <- 1 #Bind the mat to original data.table return(cbind(dt, mat)) } result <- funcGT3() #result for 1000 rows in dt result ID messy_string zn tc sv db yx st ze qs wq oe cv ut is kh kk im le qg rq po wd kc un ft ye if zl zt wy et rg iu 1: 1 zn$tc$sv$db$yx 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2: 2 st$ze$qs$wq 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3: 3 oe$cv$ut$is 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4: 4 kh$kk$im$le$qg 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5: 5 rq$po$wd$kc 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 --- 996: 996 rp$cr$tb$sa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 997: 997 cz$wy$rj$he 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 998: 998 cl$rr$bm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 999: 999 sx$hq$zy$zd 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000: 1000 bw$cw$pw$rq 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
Ricardo's second test version 2 (this is for 250K lines):
Unit: seconds expr min lq median uq max neval GT2 104.68672 104.68672 104.68672 104.68672 104.68672 1 GT3 15.15321 15.15321 15.15321 15.15321 15.15321 1
VERSION 1 Below is version 1 of the proposed answer
set.seed(10) elements_list <- c(outer(letters, letters, FUN = paste, sep = "")) random_string <- function(min_length, max_length, separator) { selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator) return(selection) } dt <- data.table(ID = c(1:1000), messy_string = "") dt[ , messy_string := random_string(2, 5, "$"), by = ID] myFunc <- function() { ll <- strsplit(dt[,messy_string], split="\\$") COLS <- do.call(rbind, lapply(1:length(ll), function(i) { data.frame( ID= rep(i, length(ll[[i]])), COL = ll[[i]], VAL= rep(1, length(ll[[i]])) ) } ) ) res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )) dt <- cbind(dt, res) for (j in names(dt)) set(dt,which(is.na(dt[[j]])),j,0) return(dt) } create_indicators <- function(search_list, searched_string) { y <- rep(0, length(search_list)) for(j in 1:length(search_list)) { x <- regexpr(search_list[j], searched_string) x <- x[1] y[j] <- ifelse(x > 0, 1, 0) } return(y) } OPFunc <- function() { indicators <- matrix(0, nrow = nrow(dt), ncol = length(elements_list)) for(n in 1:nrow(dt)) { indicators[n, ] <- dt[n, create_indicators(elements_list, messy_string)] } indicators <- data.table(indicators) setnames(indicators, elements_list) dt <- cbind(dt, indicators) return(dt) } library(plyr) plyrFunc <- function() { indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i) dt[i, data.frame(t(as.matrix(table(strsplit(messy_string, split = "\\$"))))) ])) dt = cbind(dt, indicators)
REFERENCE
system.time(res <- myFunc()) ## user system elapsed ## 1.01 0.00 1.01 system.time(res2 <- OPFunc()) ## user system elapsed ## 21.58 0.00 21.61 system.time(res3 <- plyrFunc()) ## user system elapsed ## 1.81 0.00 1.81
VERSION 2: Suggested by Ricardo
I post this here and not in my answer as the structure is really @ GeekTrader's -Rick _
myFunc.modified <- function() { ll <- strsplit(dt[,messy_string], split="\\$")