The fastest way to transfer a list to R / Rcpp
I have a list:
ls <- list(c("a", "b", "c"), c("1", "2", "3"), c("foo", "bar", "baz")) ls #> [[1]] #> [1] "a" "b" "c" #> [[2]] #> [1] "1" "2" "3" #> [[3]] #> [1] "foo" "bar" "baz" which I want to "transpose" to give:
resulting_ls #> [[1]] #> [1] "a" "1" "foo" #> [[2]] #> [1] "b" "2" "bar" #> [[3]] #> [1] "c" "3" "baz" I can achieve this with
mat <- matrix(unlist(ls), ncol = 3, byrow = TRUE) resulting_ls <- lapply(1:ncol(mat), function(i) mat[, i]) But with my real data, it is very slow ... (and I need to do this for many lists, each of which is much larger than the example above)
My question is:
What is the fastest way to do this for a large list of length(ls) and / or length(ls[[i]]) ?
- in
R(if this is no longer the case) - with
Rcpp
In the development version of data.table , v1.9.5, there is a transpose() function that does just that. It is implemented in C for speed.
require(data.table) # v1.9.5+ transpose(ls) # [[1]] # [1] "a" "1" "foo" # [[2]] # [1] "b" "2" "bar" # [[3]] # [1] "c" "3" "baz" It is also automatically filled with NA if the list items do not have the same length, and is also automatically entered in the top SEXPTYPE. If necessary, you can specify a different value in the fill argument. Check ?transpose .
Installation instructions to get v1.9.5 here .
"list" are R-objects without the equivalent of C, therefore, their processing in C will be more effective only from the point of view of the surrounding calculations, since the actual transposition will have to be returned between R-objects. Arun transpose is a concise approach to this problem and, apparently, could not be better. I will simply provide several alternatives to show that moving the โlistโ can be frustrating, and perhaps adopting a different approach to achieving the end goal may be better.
map = function(x) .mapply(c, x, NULL) lap = function(x) lapply(seq_along(x[[1]]), function(i) unlist(lapply(x, "[[", i))) library(data.table) DT = function(x) transpose(x) # very simple C loop that proves that `data.table::transpose` is as good as it gets loopC = inline::cfunction(sig = c(R_ls = "list"), body = ' SEXPTYPE tp = 0; SEXP ans, tmp; PROTECT(ans = allocVector(VECSXP, LENGTH(VECTOR_ELT(R_ls, 0)))); for(int i = 0; i < LENGTH(R_ls); i++) { tmp = VECTOR_ELT(R_ls, i); if(TYPEOF(tmp) > tp) tp = TYPEOF(tmp); } for(int i = 0; i < LENGTH(ans); i++) SET_VECTOR_ELT(ans, i, allocVector(tp, LENGTH(R_ls))); switch(tp) { case LGLSXP: case INTSXP: { for(int i = 0; i < LENGTH(R_ls); i++) { PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp)); int *ptmp = INTEGER(tmp); for(int j = 0; j < LENGTH(ans); j++) INTEGER(VECTOR_ELT(ans, j))[i] = ptmp[j]; UNPROTECT(1); } break; } case REALSXP: { for(int i = 0; i < LENGTH(R_ls); i++) { PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp)); double *ptmp = REAL(tmp); for(int j = 0; j < LENGTH(ans); j++) REAL(VECTOR_ELT(ans, j))[i] = ptmp[j]; UNPROTECT(1); } break; } case STRSXP: { for(int i = 0; i < LENGTH(R_ls); i++) { PROTECT(tmp = coerceVector(VECTOR_ELT(R_ls, i), tp)); for(int j = 0; j < LENGTH(ans); j++) SET_STRING_ELT(VECTOR_ELT(ans, j), i, STRING_ELT(tmp, j)); UNPROTECT(1); } break; } } UNPROTECT(1); return(ans); ') spl = function(x) split(unlist(x), rep(seq_along(x[[1]]), length(x))) map(ls) #[[1]] #[1] "a" "1" "foo" # #[[2]] #[1] "b" "2" "bar" # #[[3]] #[1] "c" "3" "baz" # lap(ls) #[[1]] #[1] "a" "1" "foo" # #[[2]] #[1] "b" "2" "bar" # #[[3]] #[1] "c" "3" "baz" # DT(ls) #[[1]] #[1] "a" "1" "foo" # #[[2]] #[1] "b" "2" "bar" # #[[3]] #[1] "c" "3" "baz" # loopC(ls) #[[1]] #[1] "a" "1" "foo" # #[[2]] #[1] "b" "2" "bar" # #[[3]] #[1] "c" "3" "baz" # spl(ls) #$`1` #[1] "a" "1" "foo" # #$`2` #[1] "b" "2" "bar" # #$`3` #[1] "c" "3" "baz" And the test:
myls1 = rep_len(list(sample(1e3), runif(1e3), sample(letters, 1e3, T)), 1e3) #1e3 x 1e3 myls2 = rep_len(list(sample(1e5), runif(1e5), sample(letters, 1e5, T)), 1e1) #10 x 1e5 myls3 = rep_len(list(sample(1e1), runif(1e1), sample(letters, 1e1, T)), 1e5) #1e5 x 10 identical(map(myls1), lap(myls1)) #[1] TRUE identical(map(myls1), DT(myls1)) #[1] TRUE identical(map(myls1), loopC(myls1)) #[1] TRUE identical(map(myls1), unname(spl(myls1))) #[1] TRUE microbenchmark::microbenchmark(map(myls1), lap(myls1), DT(myls1), loopC(myls1), spl(myls1), map(myls2), lap(myls2), DT(myls2), loopC(myls2), spl(myls2), map(myls3), lap(myls3), DT(myls3), loopC(myls3), spl(myls3), times = 10) #Unit: milliseconds # expr min lq median uq max neval # map(myls1) 1141.9477 1187.8107 1281.4314 1331.4490 1961.8452 10 # lap(myls1) 1082.7023 1104.6467 1182.8303 1219.5397 1695.6164 10 # DT(myls1) 378.0574 399.7339 433.4307 459.0293 495.2200 10 # loopC(myls1) 390.0305 392.5139 405.6461 480.7480 638.9145 10 # spl(myls1) 676.2639 756.1798 786.8639 821.7699 869.0219 10 # map(myls2) 1241.1010 1304.2250 1386.1915 1439.5182 1546.3835 10 # lap(myls2) 1823.2029 1922.1878 1965.6653 2006.6102 2161.9819 10 # DT(myls2) 471.5797 521.7380 554.2221 578.3043 887.1452 10 # loopC(myls2) 472.5713 494.9302 524.2538 591.0493 657.6087 10 # spl(myls2) 1108.1530 1117.7448 1212.0051 1297.8838 1336.8266 10 # map(myls3) 2005.1325 2178.3739 2214.1824 2451.7050 2539.5152 10 # lap(myls3) 1172.3033 1215.1297 1242.0294 1292.7345 1434.1707 10 # DT(myls3) 388.6679 393.5446 416.5494 479.1473 721.0758 10 # loopC(myls3) 389.4098 396.6768 404.9609 432.4390 451.8912 10 # spl(myls3) 675.7749 704.3328 767.0548 817.7189 937.1469 10