Quick check to see if one vector is part of another vector

I have a list of vectors, say vecs = list(vec1=1:3, vec2=1:5, vec3=2:6, vec4=1:7). I want to delete all members of the list if it is contained in another element of the list. For example, vecs$vec1is part vecs$vec2, (or vecs$vec4), so I want to delete it.

I want a very fast implementation, because it is length(vecs)very large. What I did was first sort the member vecsby length vecs = vecs[ order(unlist(lapply(vecs, length))) ], then to check_member = vecs[i]check if it is part vecs[ i+1 ], vecs[ i+2 ].... Is there any better strategy? Full code:

vecs = list(vec1=1:3, vec2=1:5, vec3=2:6, vec4=1:7, vec5=2:3)
vecs = vecs[ order(unlist(lapply(vecs, length))) ] ##sort by member length
vecs_len = length(vecs)
toRemove = numeric(vecs_len ) ##record whether to remove this member
for( i in 1:(vecs_len-1 ))
for( j in (i+1):(vecs_len ))
{
   if( length( setdiff(vecs[[i]],vecs[[j]]) )==0  ) {toRemove[i] = 1; break} ##check whether vecs[[i]] is part of vecs[[j]]

}
vecs = vecs[!toRemove]
+4
source share
3 answers

, . , . , ( lapply(vecs, unique), )

vecs <- list(vec1=letters[1:3],
             vec2=letters[1:5],
             vec3=letters[2:6],
             vec4=letters[1:7])

-, (.. ), :

vlevels <- unique(unlist(vecs))
nlev    <- length(vlevels)
fvecs   <- lapply(vecs, factor, levels = vlevels)

0 1 / :

vtabs <- vapply(fvecs, tabulate, integer(nlev), nbins = nlev)
#      vec1 vec2 vec3 vec4
# [1,]    1    1    0    1
# [2,]    1    1    1    1
# [3,]    1    1    1    1
# [4,]    0    1    1    1
# [5,]    0    1    1    1
# [6,]    0    0    1    1
# [7,]    0    0    0    1

, , , , , .

num.match <- crossprod(vtabs)
vlen <- sapply(vecs, length)
is.subset.mat <- num.match == vlen
diag(is.subset.mat) <- FALSE
#       vec1  vec2  vec3  vec4
# vec1 FALSE  TRUE FALSE  TRUE
# vec2 FALSE FALSE FALSE  TRUE
# vec3 FALSE FALSE FALSE  TRUE
# vec4 FALSE FALSE FALSE FALSE

, , , :

is.subset <- rowSums(is.subset.mat) > 0L
# vec1  vec2  vec3  vec4 
# TRUE  TRUE  TRUE FALSE 

:

out <- vecs[!is.subset]

, , chunk-by-chunk, .

+4

, , , . %in% any , setdiff length.

# Your code
fun1 <- function(vecs){
  vecs_len = length(vecs)
  toRemove = numeric(vecs_len ) ##record whether to remove this member
  for( i in 1:(vecs_len-1 ))
    for( j in (i+1):(vecs_len ))
    {
      if( length( setdiff(vecs[[i]],vecs[[j]]) )==0  ) toRemove[i] = 1 ##check whether vecs[[i]] is part of vecs[[j]]

    }
  vecs = vecs[!toRemove]
  return(vecs)
}

# flodel code
fun2 <- function(vecs){
  vlevels <- unique(unlist(vecs))
  nlev    <- length(vlevels)
  fvecs   <- lapply(vecs, factor, levels = vlevels)
  vtabs <- vapply(fvecs, tabulate, integer(nlev), nbins = nlev)
  num.match <- crossprod(vtabs)
  vlen <- sapply(vecs, length)
  is.subset.mat <- num.match == vlen
  diag(is.subset.mat) <- FALSE
  is.subset <- rowSums(is.subset.mat) > 0L
  out <- vecs[!is.subset]
  return(out)  
}

# slight modification
fun3 <- function(vecs){
  vecs_len = length(vecs)
  toRemove = numeric(vecs_len ) ##record whether to remove this member
  for( i in 1:(vecs_len-1 ))
    for( j in (i+1):(vecs_len ))
    {
      if( any(vecs[[i]] %in% vecs[[j]])  ) toRemove[i] = 1 ##check whether vecs[[i]] is part of vecs[[j]]

    }
  vecs = vecs[!toRemove]
  return(vecs)
}

microbenchmark(fun1(vecs), fun2(vecs), fun3(vecs), times=100L)
Unit: microseconds
       expr     min       lq      mean   median       uq     max neval
 fun1(vecs) 154.919 166.4245 179.62297 172.5880 180.3950 356.681   100
 fun2(vecs) 220.255 231.5555 279.99874 237.7185 290.9335 609.398   100
 fun3(vecs)  50.544  54.0370  64.86082  57.3250  64.5155 291.345   100
+1

, - . , . , , ( ). https://stat.ethz.ch/pipermail/r-help/2005-March/068491.html % in%.

dfSubsets <- list()

ds <- lapply(vecs,function(x){
        subsets = 0
        lapply( vecs, function(y){
            if(all(x %in% y)){subsets <<- subsets + 1}
        })
        if(subsets > 1){
            dfSubsets <<- c(dfSubsets,list(x))
        }
}
)

dfSubsets

, , .

I'm sure dplyr might also help, but I'm not familiar with this.

Applying usually leads to increased efficiency - of course, if you only care about whether it is a subset of one vector, then sometimes it will be faster to use a for and break loop, but you cannot know this in advance.

This method works for strings or integers.

0
source

All Articles