In R, compare two vectors of strings and find which part of each element of the second vector follows an element of the first

NOTE. I updated the question to reflect specific patterns in the data.

Let's say that I have two vectors.

   # Variable names, and the selected "name + level" labels to be matched.
   names_data <- c("A", "B", "C", "D", "E", "F")
   levels_selected <- c(
     "A1", "A3", "Blow", "Bhigh", "D(4.88e+03,9.18+e+04]", "F"
   )

I want to know how to get a vector, data frame, list, or some other structure that, for each variable name, returns which of its levels were selected. Something that says:

    A: 1, 3
    B: low, high
    D: (4.88e+03,9.18e+04]

Ultimately, there is a data frame X for which names_data = names(data) and levels_selected are some, but not all, of the levels in each of the variables. In the end, I want to build a matrix (e.g., for a random forest) using model.matrix in which I include only the variables and levels in levels_selected. Is there an easy way to do this?

+4
3

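We can create a grouping variable ("grp") by keeping the "names_data" prefix of each element of "levels_selected", then split the remainder of each element by "grp" to get a list.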

# One anchored alternation, built once and reused: a variable name only
# counts when it is the prefix of a label.
name_prefix <- paste0("^(", paste(names_data, collapse = "|"), ")")
# Group key: the leading variable name of each label.
grp <- sub(paste0(name_prefix, ".*"), "\\1", levels_selected)
# Level: the label with only that leading name removed. An unanchored gsub()
# would also delete name characters occurring later in the label
# (e.g. "AB1" would become "1" instead of "B1").
value <- sub(name_prefix, "", levels_selected)
# List of levels keyed by variable name.
lst <- split(value, grp)
lst
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "x"

# mtabulate() is provided by the qdapTools package.
library(qdapTools)
# Cross-tabulate the list: one row per element of lst, one column per value.
mtabulate(lst)
#  1 3 high low x
#A 1 1    0   0 0
#B 0 0    1   1 0
#D 0 0    0   0 1

Or use strsplit to split each element right after its name:

# Zero-width lookbehind: split each label immediately after a variable name.
split_after_name <- paste0("(?<=(", paste(names_data, collapse = "|"), "))")
pieces <- strsplit(levels_selected, split_after_name, perl = TRUE)
# Stack the split pieces as rows of a character data frame (columns V1, V2).
d1 <- as.data.frame(do.call(rbind, pieces), stringsAsFactors = FALSE)
# Collapse the V2 values of each V1 into one comma-separated string.
aggregate(V2 ~ V1, data = d1, FUN = toString)
#  V1        V2
#1  A      1, 3
#2  B low, high
#3  D         x

If needed, the result can then be passed to model.matrix:

# Dummy-code V1 and V2 from d1; "-1" drops the intercept column.
model.matrix(~V1+V2-1, d1)

Update

OP

# Split each label right after its variable name (zero-width lookbehind).
alternation <- paste(names_data, collapse = "|")
parts <- strsplit(
  levels_selected,
  paste0("(?<=(", alternation, "))"),
  perl = TRUE
)
d1 <- as.data.frame(do.call(rbind, parts), stringsAsFactors = FALSE)
# List of levels (V2) keyed by variable name (V1).
with(d1, split(V2, V1))
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "(4.88e+03,9.18+e+04]"

.

Update2

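If some elements consist only of a name from 'names_data', with no level after it, keep only the elements that split into two parts: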

# Split every label right after its variable name.
lst <- strsplit(
  levels_selected,
  paste0("(?<=(", paste(names_data, collapse = "|"), "))"),
  perl = TRUE
)
# Keep only the labels that produced exactly two pieces (name and level).
is_pair <- lengths(lst) == 2
d2 <- as.data.frame(do.call(rbind, lst[is_pair]), stringsAsFactors = FALSE)
split(d2$V2, d2$V1)
#$A
#[1] "1" "3"

#$B
#[1] "low"  "high"

#$D
#[1] "(4.88e+03,9.18+e+04]"
+1

, , :

# For every variable name, collect the selected labels that start with that
# name and strip the name off the front. startsWith()/substring() match the
# name literally and only as a prefix, so regex metacharacters in a name, or a
# name occurring in the middle of a label, cannot produce false hits.
setNames(
  lapply(names_data, function(x) {
    # which() drops NA labels instead of propagating them.
    hits <- levels_selected[which(startsWith(levels_selected, x))]
    substring(hits, nchar(x) + 1L)
  }),
  names_data
)
$A
[1] "1" "3"

$B
[1] "low"  "high"

$C
character(0)

$D
[1] "x"

$E
character(0)
0

Alternatively, you can use regexpr/gregexpr with perl = TRUE and named capture groups:

# Extract the text captured by each capture group of `pattern` from every
# element of `data`.
#
# Arguments:
#   data     character vector to search.
#   pattern  Perl-compatible regular expression containing capture groups
#            (named groups give named columns).
#
# Returns the per-element results combined with rbind(): one row per match,
# one column per capture group. Elements that are NA or do not match
# contribute "" (recycled across the columns by rbind()).
parseAll <- function(data, pattern) {

  result <- gregexpr(pattern, data, perl = TRUE)
  do.call(rbind, lapply(seq_along(data), function(i) {
    hits <- result[[i]]
    # gregexpr() yields NA for NA input and -1 for "no match"; checking for NA
    # first keeps if() from failing on a missing condition.
    if (anyNA(hits) || any(hits == -1)) return("")
    st <- data.frame(attr(hits, "capture.start"))
    le <- data.frame(attr(hits, "capture.length") - 1)

    # Walk the capture groups column by column and cut each captured span
    # out of the current element.
    mapply(
      function(start, leng) substring(data[i], start, start + leng),
      st, le
    )
  }))
}

EDIT: , , . "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)" ( regexpr) , .

, , :

# Small example input: a prefix letter followed by a level.
dat <- c("A1", "A2", "A3", "B3")

Calling parseAll(dat, 'A(?<A>.*)|B(?<B>.*)') returns a character matrix (which can be converted to a data.frame) with one column per named group:

# One alternative per prefix; each named group captures the text after it.
parseAll(dat,'A(?<A>.*)|B(?<B>.*)')

     A   B  
[1,] "1"  "" 
[2,] "2"  ""
[3,] "3"  "" 
[4,] ""   "3" 

To build the pattern from names_data (instead of typing it by hand), use:

# One "NAME(?<NAME>.*)" alternative per variable name, joined with "|".
capture_group <- paste0("(?<", names_data, ">.*)")
pattern <- paste(paste0(names_data, capture_group), collapse = "|")

( data.frame, )

omnitool , ,

0

All Articles