A basic R solution that works directly on the matrix, without using data frames:
# Split every "x_y" cell of the character matrix `m` into two adjacent
# columns, keeping the row names and repeating each column name twice.

# split
# strsplit() visits the cells in column-major order, so `z` alternates
# first part, second part, first part, ...
z <- unlist(strsplit(m, "_"))
# The de-interleaving below is only valid when every cell yields exactly two
# pieces; a mismatch in the total count means at least one cell did not.
stopifnot(length(z) == 2L * length(m))
# All first parts fill the left half of M, all second parts the right half.
M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))

# properly order columns
# Interleave the two halves so that each original column becomes a pair of
# neighbouring columns. drop = FALSE keeps M a matrix when `m` has one row.
i <- seq_len(ncol(M))
M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]

# set dimnames
rownames(M) <- rownames(m)
colnames(M) <- rep(colnames(m), each = 2)

#     9178 9178 3574 3574 3547 3547
# 160 "B"  "B"  "A"  "B"  "B"  "A"
# 301 "B"  "A"  "A"  "B"  "B"  "B"
# 303 "B"  "B"  "A"  "B"  "B"  "A"
# 311 "A"  "A"  "A"  "B"  "B"  "A"
# 312 "B"  "A"  "A"  "B"  "B"  "A"
# 314 "B"  "A"  "A"  "B"  "B"  "A"
[Update] Here is a small comparative study of the proposed solutions (I did not include the cSplit solution because it was too slow):
Setup:
# Benchmark input: a 1000 x 2830 character matrix and the equivalent
# data frame, every cell holding "A_B".
m <- matrix("A_B", nrow = 1000, ncol = 2830)
d <- as.data.frame(m, stringsAsFactors = FALSE)

#####
# Base R: split every "x_y" cell of the character matrix `m` into two
# adjacent columns. Returns a character matrix with nrow(m) rows and
# 2 * ncol(m) columns; row names are kept, column names are repeated twice.
f.mtrx <- function(m) {
  # strsplit() visits the cells in column-major order, so `z` alternates
  # first part, second part, first part, ...
  z <- unlist(strsplit(m, "_"))
  # The de-interleaving below is only valid when every cell yields exactly
  # two pieces; a mismatch in the total count means at least one did not.
  stopifnot(length(z) == 2L * length(m))
  # All first parts fill the left half of M, all second parts the right half.
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # properly order columns: interleave the two halves.
  # drop = FALSE keeps M a matrix when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

library(stringi)
# Same algorithm as f.mtrx, but splitting with stringi's fixed-pattern
# splitter instead of base strsplit().
f.mtrx2 <- function(m) {
  z <- unlist(stri_split_fixed(m, "_"))
  # See f.mtrx: every cell must have produced exactly two pieces.
  stopifnot(length(z) == 2L * length(m))
  M <- matrix(c(z[c(TRUE, FALSE)], z[c(FALSE, TRUE)]), nrow = nrow(m))
  # properly order columns: interleave the two halves.
  # drop = FALSE keeps M a matrix when `m` has a single row.
  i <- seq_len(ncol(M))
  M <- M[, order(c(i[c(TRUE, FALSE)], i[c(FALSE, TRUE)])), drop = FALSE]
  # set dimnames
  rownames(M) <- rownames(m)
  colnames(M) <- rep(colnames(m), each = 2)
  M
}

#####
library(splitstackshape)
# cSplit on every column of the data frame. Defined for completeness only;
# it is not part of the benchmark() call below.
f.cSplit <- function(mydf) {
  cSplit(as.data.table(mydf, keep.rownames = TRUE), names(mydf), "_")
}

#####
library(stringi)
# Split each data-frame column with stri_split_fixed(simplify = TRUE), bind
# the per-column results side by side, then set the dimnames in one step.
f.stringi <- function(mydf) {
  `dimnames<-`(
    do.call(cbind, lapply(mydf, stri_split_fixed, "_", simplify = TRUE)),
    list(rownames(mydf), rep(colnames(mydf), each = 2))
  )
}

#####
library(dplyr)
library(tidyr)
# Separate each column into "<name>_1" and "<name>_2", then bind the pieces.
# NOTE(review): separate_() is the standard-evaluation variant, which newer
# tidyr releases mark as deprecated -- confirm against the installed version.
f.dplyr <- function(df) {
  lapply(names(df), function(x) {
    separate_(df[x], x, paste0(x, "_", 1:2), sep = "_")
  }) %>%
    bind_cols()
}

#####
library(iotools)
# Split each data-frame column with iotools::mstrsplit() into two character
# columns, bind the results side by side, then set the dimnames in one step.
f.mstrsplit <- function(mydf) {
  `dimnames<-`(
    do.call(cbind, lapply(mydf, mstrsplit, "_", ncol = 2, type = "character")),
    list(rownames(mydf), rep(colnames(mydf), each = 2))
  )
}

#####
library(rbenchmark)
benchmark(
  f.mtrx(m),
  f.mtrx2(m),
  f.dplyr(d),
  f.stringi(d),
  f.mstrsplit(d),
  replications = 10
)
Results:
            test replications elapsed relative user.self sys.self user.child sys.child
3     f.dplyr(d)           10  27.722   10.162    27.360    0.269          0         0
5 f.mstrsplit(d)           10   2.728    1.000     2.607    0.098          0         0
1      f.mtrx(m)           10  37.943   13.909    34.885    0.799          0         0
2     f.mtrx2(m)           10  15.176    5.563    13.936    0.802          0         0
4   f.stringi(d)           10   8.107    2.972     7.815    0.247          0         0
In the updated test, the winner is f.mstrsplit.