Extract substring and numbers from string in R

Question

Extract substring and numbers from string in R

I have a few lines, here are some examples.

rfoutputtablep7q10000t20000c100
rfoutputtablep7q1000t20000c100
svmLinear2outputtablep7q20000t20000c100
svmLinear2outputtablep7q5000t20000c100

I want to create a data frame with the columns algorithm, p, q, t and c, and extract their values from these lines. The text before "outputtable" is the algorithm, the number after "p" is the value of p, the number after "q" is the value of q, and so on.

How can I create this data frame?

+6

regex r

Jack arnestad Nov 21 '17 at 16:18

source share

4 answers

library(stringr)

# Keys whose trailing digits become the columns of the result.
myd <- c("p", "q", "t", "c")

# One column per key: first grab "<key><digits>", then keep only the digits.
# lapply() over a named vector yields one list element per key, so the result
# has one row per input string even when x has length 1 (sapply() would
# collapse that case into a single column). Values are left as strings.
data.frame(
  lapply(
    setNames(myd, myd),
    function(a) str_extract(str_extract(x, paste0(a, "\\d+")), "\\d+")
  ),
  stringsAsFactors = FALSE
)
#  p     q     t   c
#1 7 10000 20000 100
#2 7  1000 20000 100
#3 7 20000 20000 100
#4 7  5000 20000 100

# For the first column: everything before the first "outputtable".
# regexpr() returns exactly one position per string, so the result stays
# aligned with x (unlist(gregexpr()) would not if the marker repeated).
substr(x, 1, as.integer(regexpr("outputtable", x)) - 1L)
#[1] "rf"         "rf"         "svmLinear2" "svmLinear2"

DATA

# Example input strings, one per line of the question.
x <- c(
  "rfoutputtablep7q10000t20000c100",
  "rfoutputtablep7q1000t20000c100",
  "svmLinear2outputtablep7q20000t20000c100",
  "svmLinear2outputtablep7q5000t20000c100"
)

+4

db Nov 21 '17 at 16:29

source share

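To extract the algorithm name, capture the word characters that precede "outputtable" using a lookahead: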

# Replace each string by the run of word characters found directly before
# "outputtable"; the lookahead keeps the marker out of the captured group.
gsub(
  pattern = "^(\\w+)(?=outputtable).*",
  replacement = "\\1",
  x = string,
  perl = TRUE
)

Demo: https://regex101.com/r/7vDK1x/2

For p, q, t and c, use a lookbehind instead (for p, that is (?<=p)). For example, for q:

# Replace each string by the digits that follow a "q"; the lazy prefix and
# the lookbehind position the capture group right after that letter.
gsub(
  pattern = ".*?(?<=q)(\\d+).*",
  replacement = "\\1",
  x = a,
  perl = TRUE
)

+4

Mako212 Nov 21 '17 at 16:29

, stringi. , . stringi R, , , , . , . , stringi . ( , , data.frame .)

: Rui Barradas . (i) , stringi, , , , . (ii) , , , ( ). , , . , , , ..

, stringi .

Please correct me if I have overlooked anything that affects a fair comparison (in particular, I think the stringr solution could be improved, but I am not that familiar with the package, so I kept the solution as proposed).

library(stringi)
library(stringr)
library(microbenchmark)

# Benchmark input: the four example strings from the question.
strings <- c(
  "rfoutputtablep7q10000t20000c100",
  "rfoutputtablep7q1000t20000c100",
  "svmLinear2outputtablep7q20000t20000c100",
  "svmLinear2outputtablep7q5000t20000c100"
)


# Split each string on a set of regex delimiters and return the pieces as a
# data frame.
#
# Arguments:
#   string            character vector to split (one output row per element).
#   splititems        character vector of regex delimiters; they are joined
#                     with "|" into a single split pattern.
#   colidschar        indices of the columns to keep as character.
#   firstcolname      name given to the first column.
#   replsplit_tonames regex removed from each entry of splititems to derive
#                     the remaining column names (assigned positionally).
#
# Returns a data frame whose columns not listed in colidschar have been
# converted with as.numeric().
split_to_df <- function(string, splititems, colidschar, firstcolname, replsplit_tonames) {

  # Split the argument that was passed in, not a global variable.
  pieces <- stri_split_regex(string, paste(splititems, collapse = "|"))
  data <- as.data.frame(do.call(rbind, pieces), stringsAsFactors = FALSE)
  names(data) <- c(firstcolname, stri_replace_all_regex(splititems, replsplit_tonames, ""))

  # seq_len() stays empty for a zero-column frame, and list-style indexing
  # keeps a data frame even when only one column is numeric.
  numericcols <- setdiff(seq_len(ncol(data)), colidschar)
  data[numericcols] <- lapply(data[numericcols], as.numeric)
  data

}

stringi_approach_complete <- function() {

  # Each delimiter must be followed by a digit; the first one also swallows
  # the "outputtable" marker together with the "p" key.
  delimiters <- c("outputtablep(?=\\d)", "q(?=\\d)", "t(?=\\d)", "c(?=\\d)")

  # Column 1 stays character; split_to_df() converts the others to numeric.
  df <- split_to_df(
    string = strings,
    splititems = delimiters,
    colidschar = 1,
    firstcolname = "A",
    replsplit_tonames = "\\(.*\\)|outputtable"
  )
  # Result for the example strings (class(df$p) is "numeric"):
  #            A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}


stringi_approach_compare <- function() {

  # Minimal stringi variant used for timing: split and name the columns,
  # without converting the value columns to numeric.
  # "outputtable" and the following "p" are consumed as one delimiter;
  # matching them as two adjacent delimiters would leave an empty field
  # between them and produce six columns for the five names below.
  split_pattern <- "outputtablep(?=\\d)|q(?=\\d)|t(?=\\d)|c(?=\\d)"
  data <- as.data.frame(do.call(rbind, stri_split_regex(strings, split_pattern)))
  names(data) <- c("A", "p", "q", "t", "c")
  # Contents of data for the example strings:
  #            A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}


stringr_approach <- function() {

  # First pass: pull "<key><digits>" out of every string, one key at a time.
  p_token <- str_extract(strings, "p\\d+")
  q_token <- str_extract(strings, "q\\d+")
  t_token <- str_extract(strings, "t\\d+")
  c_token <- str_extract(strings, "c\\d+")

  # Second pass: keep only the digits. The values are not converted to
  # numeric here.
  res <- data.frame(
    p = str_extract(p_token, "\\d+"),
    q = str_extract(q_token, "\\d+"),
    t = str_extract(t_token, "\\d+"),
    c = str_extract(c_token, "\\d+")
  )
  # Contents of res for the example strings:
  #   p     q     t   c
  # 1 7 10000 20000 100
  # 2 7  1000 20000 100
  # 3 7 20000 20000 100
  # 4 7  5000 20000 100

}

base_approach1 <- function() {

  # Split on the marker word and on every single-letter key. In the example
  # strings "p" directly follows "outputtable", which yields an empty second
  # piece; that column is removed before naming.
  pieces <- strsplit(strings, "outputtable|p|q|t|c")
  stacked <- do.call(rbind, pieces)
  res <- as.data.frame(stacked[, -2])
  names(res) <- c("A", "p", "q", "t", "c")
  # The value columns are not converted to numeric here; that would need e.g.
  #   res[-1] <- lapply(res[-1], function(x) as.numeric(as.character(x)))
  # Contents of res for the example strings:
  #           A p     q     t   c
  #1         rf 7 10000 20000 100
  #2         rf 7  1000 20000 100
  #3 svmLinear2 7 20000 20000 100
  #4 svmLinear2 7  5000 20000 100

}

base_approach2 <- function() {

  # "outputtable\\D" consumes the marker plus the character after it, so no
  # empty piece has to be removed afterwards.
  pieces <- strsplit(strings, "outputtable\\D|p|q|t|c")
  stacked <- data.frame(do.call(rbind, pieces))
  df <- setNames(stacked, c("A", "p", "q", "t", "c"))
  # The value columns are not converted to numeric here.
  # Contents of df for the example strings:
  #            A p     q     t   c
  # 1         rf 7 10000 20000 100
  # 2         rf 7  1000 20000 100
  # 3 svmLinear2 7 20000 20000 100
  # 4 svmLinear2 7  5000 20000 100

}



# Time the five approaches against each other; no extra options are passed
# to microbenchmark().
microbenchmark(
  base_approach1(), base_approach2(),
  stringi_approach_compare(), stringr_approach(),
  stringi_approach_complete()
)

# Unit: microseconds
#         expr                 min       lq     mean   median       uq       max neval
# base_approach1()            260.139 273.3635 337.1985 285.6005 298.2330  5280.152   100
# base_approach2()            352.906 362.1820 461.8205 374.8140 391.9850  4645.791   100
# stringi_approach_compare()  280.667 297.8380 312.8426 307.3125 319.1545   654.098   100
# stringr_approach()          849.499 867.6570 956.7596 886.2100 923.7115  5651.609   100
# stringi_approach_complete() 319.747 333.9580 461.5521 346.7870 369.0900 10985.052   100

+2

Manuel bickel Nov 21 '17 at 16:48

source share

Rui barradas · Accepted Answer · 2017-11-21T16:35:15+0000

Using only base R.

# Match each string as a whole, so only the p/q/t/c keys that follow
# "outputtable" separate the values. Splitting on the bare letters would also
# cut algorithm names that contain them (e.g. "rpart" or "ctree").
pattern <- "^(.*)outputtablep(\\d+)q(\\d+)t(\\d+)c(\\d+)$"

# regexec() yields the full match followed by the five capture groups.
matches <- regmatches(y, regexec(pattern, y))
# Fail loudly on strings that do not follow the expected layout instead of
# silently dropping their rows in rbind().
stopifnot(all(lengths(matches) == 6L))

res <- do.call(rbind, matches)
# Drop the full-match column; drop = FALSE keeps a matrix for a single input.
res <- as.data.frame(res[, -1, drop = FALSE], stringsAsFactors = FALSE)
res[-1] <- lapply(res[-1], as.numeric)
names(res) <- c("algorithm", "p", "q", "t", "c")
res
#   algorithm p     q     t   c
#1         rf 7 10000 20000 100
#2         rf 7  1000 20000 100
#3 svmLinear2 7 20000 20000 100
#4 svmLinear2 7  5000 20000 100

DATA

# Example input: scan() reads the four double-quoted tokens from the text
# into a character vector.
y <- scan(
  text = '"rfoutputtablep7q10000t20000c100"
"rfoutputtablep7q1000t20000c100"
"svmLinear2outputtablep7q20000t20000c100"
"svmLinear2outputtablep7q5000t20000c100"',
  what = ""
)

Extract substring and numbers from string in R

More articles: