In R, compare two lists of strings, find the parts of each element of the first list on the second

Question

In R, compare two lists of strings, find the parts of each element of the first list on the second

NOTE. I updated the question to reflect specific patterns in the data.

Let's say that I have two vectors.

   # Variable names of the data set (the first six upper-case letters).
   names_data <- LETTERS[1:6]
   # Selected levels: each entry is a variable name followed by a level label.
   levels_selected <- c(
     "A1", "A3",
     "Blow", "Bhigh",
     "D(4.88e+03,9.18+e+04]",
     "F"
   )

I want to know how to get a vector, data frame, list, or something else that checks the levels vector and returns, for each variable, which of its levels were selected. Something that says:

    A: 1, 3
    B: low, high
    D: (4.88e+03,9.18e+04]

Ultimately, there is a data frame X for which names_data = names(data) and levels_selected are some, but not all, of the levels in each of the variables. In the end, I want to make a matrix (e.g. for a random forest) using model.matrix where I want to include only the variables and levels in levels_selected. Is there an easy way to do this?

+4

r subset

Mathuser May 15, '16 at 4:24

3

, , :

> setNames(lapply(names_data, function(x) gsub(x, "", levels_selected[grepl(x, levels_selected)])), names_data)
$A
[1] "1" "3"

$B
[1] "low"  "high"

$C
character(0)

$D
[1] "x"

$E
character(0)

0

Psidom 15 '16 4:52

, , regexpr, perl

#' Extract the text captured by each capture group of a Perl regex.
#'
#' Runs `gregexpr(pattern, data, perl = TRUE)` and, for every element of
#' `data`, cuts out the substring matched by each capture group using the
#' "capture.start" / "capture.length" attributes of the match.
#'
#' @param data A character vector to search.
#' @param pattern A Perl-compatible regular expression. It is expected to
#'   contain capture groups (named groups give named columns); a pattern
#'   without groups is not handled.
#' @return The per-element results combined with `rbind()`: a character matrix
#'   with one column per capture group. A group that did not take part in a
#'   match yields "". An element with no match, or an `NA` element,
#'   contributes a row of "". An element with several matches contributes one
#'   row per match.
parseAll <- function(data, pattern) {
  result <- gregexpr(pattern, data, perl = TRUE)

  rows <- lapply(seq_along(data), function(i) {
    m <- result[[i]]
    # gregexpr() reports "no match" as -1 and an NA input as NA; without the
    # anyNA() guard the comparison below would be NA and `if` would error.
    if (anyNA(m) || any(m == -1)) {
      return("")
    }
    # One column per capture group, one row per match.
    st <- data.frame(attr(m, "capture.start"))
    # Length minus one, so that start + leng is the last captured character.
    le <- data.frame(attr(m, "capture.length") - 1)

    mapply(
      function(start, leng) substring(data[i], start, start + leng),
      st, le
    )
  })

  do.call(rbind, rows)
}

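EDIT: The pattern is expected to use named capture groups, for example "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)" (the example from the regexpr documentation); each named group becomes one column of the result.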

, , :

dat <- c('A1','A2','A3','B3')

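Calling parseAll(dat,'A(?<A>.*)|B(?<B>.*)') returns a table (a character matrix, which can be converted to a data.frame) with one column per named group: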

parseAll(dat,'A(?<A>.*)|B(?<B>.*)')

     A   B  
[1,] "1"  "" 
[2,] "2"  ""
[3,] "3"  "" 
[4,] ""   "3"

( ), :

# One alternation branch per variable name; each branch captures the rest of
# the string in a group named after that variable, e.g. "A(?<A>.*)|B(?<B>.*)".
pattern <- paste0(names_data, "(?<", names_data, ">.*)", collapse = "|")

( data.frame, )

omnitool , ,

0

Shape 15 '16 5:04

akrun · Accepted Answer · 2016-05-15T04:48:39+0000

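We can use sub to keep only the leading "names_data" part of each element of "levels_selected" (giving a grouping variable, "grp"), remove that name to get the values, and then split the values by "grp" to get a list.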

# Group label: the variable name that each selected level starts with.
grp <- sub(paste0("^(", paste(names_data, collapse = "|"), ").*"), "\\1",
           levels_selected)
# Value: the level with only that leading name removed. The pattern is
# anchored with "^" and applied once (sub rather than gsub), so a name that
# reappears later inside the level text is left intact.
value <- sub(paste0("^(", paste(names_data, collapse = "|"), ")"), "",
             levels_selected)
# Collect the values belonging to each variable name.
lst <- split(value, grp)
lst
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "x"

library(qdapTools)
mtabulate(lst)
#  1 3 high low x
#A 1 1    0   0 0
#B 0 0    1   1 0
#D 0 0    0   0 1

strsplit

# Split each selected level right after its leading variable name (zero-width
# lookbehind) and bind the pieces row-wise; as.data.frame() names the
# resulting columns V1 (name) and V2 (remainder).
d1 <- as.data.frame(
  do.call(
    rbind,
    strsplit(
      levels_selected,
      sprintf("(?<=(%s))", paste(names_data, collapse = "|")),
      perl = TRUE
    )
  ),
  stringsAsFactors = FALSE
)
# Collapse the values (V2) of each name (V1) into one comma-separated string.
aggregate(V2 ~ V1, data = d1, FUN = toString)
#  V1        V2
#1  A      1, 3
#2  B low, high
#3  D         x

, , model.matrix

model.matrix(~V1+V2-1, d1)

Update

OP

# Split every level just after its variable-name prefix (lookbehind keeps the
# prefix as the first piece), then stack the pieces into columns V1 and V2.
d1 <- as.data.frame(
  do.call(
    rbind,
    strsplit(
      levels_selected,
      paste0("(?<=(", paste0(names_data, collapse = "|"), "))"),
      perl = TRUE
    )
  ),
  stringsAsFactors = FALSE
)
# List of remainders (V2) grouped by variable name (V1).
split(d1[["V2"]], d1[["V1"]])
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "(4.88e+03,9.18+e+04]"

.

Update2

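If some elements consist only of a name from 'names_data', with nothing after it, we can keep just the elements that split into two pieces: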

# Split each level right after its leading variable name.
lst <- strsplit(
  levels_selected,
  paste0("(?<=(", paste(names_data, collapse = "|"), "))"),
  perl = TRUE
)
# Keep only the elements that produced exactly two pieces (name + remainder),
# then stack them into columns V1 and V2.
d2 <- as.data.frame(
  do.call(rbind, Filter(function(piece) length(piece) == 2, lst)),
  stringsAsFactors = FALSE
)
# List of remainders (V2) grouped by variable name (V1).
split(d2[["V2"]], d2[["V1"]])
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "(4.88e+03,9.18+e+04]"

In R, compare two lists of strings, find the parts of each element of the first list on the second

Update

Update2

More articles: