Optimization of "set in the list of rows" for the operation "set as matrix"

Question

Optimization of "set in the list of rows" for the operation "set as matrix"

I have a set of strings that contain elements separated by a space. I want to build a matrix that tells me which elements were part of the rows. For example:

""
"A B C"
"D"
"B D"

Should give something like:

Now I have a solution, but it works slower, like a molasses, and I'm running out of ideas on how to do it faster:

reverseIn <- function(vector, value) {
    return(value %in% vector)
}

buildCategoryMatrix <- function(valueVector) {
    allClasses <- c()
    for(classVec in unique(valueVector)) {
        allClasses <- unique(c(allClasses,
                               strsplit(classVec, " ", fixed=TRUE)[[1]]))
    }

    resMatrix <- matrix(ncol=0, nrow=length(valueVector))
    splitValues <- strsplit(valueVector, " ", fixed=TRUE)

    for(cat in allClasses) {
        if(cat=="") {
            catIsPart <- (valueVector == "")
        } else {
            catIsPart <- sapply(splitValues, reverseIn, cat)
        }
        resMatrix <- cbind(resMatrix, catIsPart)
    }
    colnames(resMatrix) <- allClasses

    return(resMatrix)
}

Function profiling gives me the following:

$by.self
                  self.time self.pct total.time total.pct
"match"               31.20    34.74      31.24     34.79
"FUN"                 30.26    33.70      74.30     82.74
"lapply"              13.56    15.10      87.86     97.84
"%in%"                12.92    14.39      44.10     49.11

So, my current questions: - Where is 33% spent on "FUN"? - Was there a way to speed up the% in% call?

, , R. (IE, resMatrix [i, x] < - 1), - , x - . , , , .

+4

optimization string-matching r

Eric Fournier 25 . '13 15:46

3

vapply:

x <- c("" , "A B C" , "D" , "B D" )
lines <- strsplit(x, " ", fixed = TRUE)

all <- sort(unique(unlist(lines)))

t(vapply(lines, function(x) all %in% x, numeric(length(all))))

, @Ananda: https://gist.github.com/hadley/7169138

+4

hadley 26 . '13 12:52

. , out . , . , , . TRUE, FALSE. * 1L , integer, as.integer. sapply , row-wise, t().

data.frame .

#  Data
str <- c("" , "A B C" , "D" , "B D" )

#  Unique column headers (excluding empty strings as in example)
y <- unique( unlist( strsplit( str , " " ) ) )

#  Results
out <- t( sapply( str , function(x) y %in% unlist( strsplit( x , " " ) ) , USE.NAMES = FALSE ) ) * 1L

#  Combine to a data.frame
setNames( data.frame( out ) , y )
#  A B C D
#1 0 0 0 0
#2 1 1 1 0
#3 0 0 0 1
#4 0 1 0 1

+2

Simon O'Hanlon 25 . '13 16:10

A5C1D2H2I1M1N2O1R2T1 · Accepted Answer · 2013-10-25T16:18:53+0000

splitstackshape charBinaryMat, - :

( CRAN ):

charBinaryMat <- function(listOfValues, fill = NA) {
  lev <- sort(unique(unlist(listOfValues, use.names = FALSE)))
  m <- matrix(fill, nrow = length(listOfValues), ncol = length(lev))
  colnames(m) <- lev
  for (i in 1:nrow(m)) {
    m[i, listOfValues[[i]]] <- 1
  }
  m
}

, strsplit:

:

str <- c("" , "A B C" , "D" , "B D" )

## Fill is `NA` by default
charBinaryMat(strsplit(str, " ", fixed=TRUE))
#       A  B  C  D
# [1,] NA NA NA NA
# [2,]  1  1  1 NA
# [3,] NA NA NA  1
# [4,] NA  1 NA  1

## Can easily be set to another value
charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
#      A B C D
# [1,] 0 0 0 0
# [2,] 1 1 1 0
# [3,] 0 0 0 1
# [4,] 0 1 0 1

, .

:

CBM <- function() {
  charBinaryMat(strsplit(str, " ", fixed=TRUE), fill = 0)
}
BCM <- function() {
  buildCategoryMatrix(str)*1L
}
Sapply <- function() {
  y <- unique( unlist( strsplit( str , " " ) ) )
  out <- t(sapply(str, function(x) y %in% unlist(strsplit(x , " " )),
                  USE.NAMES = FALSE )) * 1L
  colnames(out) <- y
  out
}

:

set.seed(1)
A = sample(10, 100000, replace = TRUE)
str <- sapply(seq_along(A), function(x)
  paste(sample(LETTERS[1:10], A[x]), collapse = " "))
head(str)
# [1] "H G C"               "F H J G"             "H D J A I B"        
# [4] "A C F H J B E G D I" "F C H"               "I C G B J D F A E"

:

## Automatically sorted
head(CBM())
#      A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1

## Sorting just for comparison
head(BCM())[, LETTERS[1:10]]
#      A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1

## Sorting just for comparison
head(Sapply())[, LETTERS[1:10]]
#      A B C D E F G H I J
# [1,] 0 0 1 0 0 0 1 1 0 0
# [2,] 0 0 0 0 0 1 1 1 0 1
# [3,] 1 1 0 1 0 0 0 1 1 1
# [4,] 1 1 1 1 1 1 1 1 1 1
# [5,] 0 0 1 0 0 1 0 1 0 0
# [6,] 1 1 1 1 1 1 1 0 1 1

:

library(microbenchmark)
microbenchmark(CBM(), BCM(), Sapply(), times=20)
# Unit: milliseconds
#      expr        min         lq     median         uq        max neval
#     CBM()   675.0929   718.3454   777.2423   805.3872   858.6609    20
#     BCM() 11059.6305 11267.9888 11367.3283 11595.1758 11792.5950    20
#  Sapply()  3536.7755  3687.0308  3759.7388  3813.4233  3968.3192    20

Optimization of "set in the list of rows" for the operation "set as matrix"

More articles: