Merge strings between strings by id

Question

Merge strings between strings by id

I want to combine the lines between the lines of the id variable. I know how to do this with the code Rbelow. However, my code seems too complicated.

In this case, each line has two elements that are not points. Each pair of consecutive lines inside an identifier has one common element. Thus, only one of these elements remains after combining the two lines.

The desired result is displayed, and the code Rbelow returns the desired result. Thank you for any suggestions. Sorry, my code is Rso long and confusing, but it works, and my goal is to get more efficient code in the database R.

my.data <- read.table(text = '
     id         my.string
      2    11..................
      2    .1...2..............
      2    .....2...3..........
      5    ....................
      6    ......2.....2.......
      6    ............2...4...
      7    .1...2..............
      7    .....2....3.........
      7    ..........3..3......
      7    .............34.....
      8    ....1.....1.........
      8    ..........12........
      8    ...........2....3...
      9    ..................44
     10    .2.......2..........
     11    ...2...2............
     11    .......2.....2......
     11    .............2...2..
', header = TRUE, na.strings = 'NA', stringsAsFactors = FALSE)
my.data

desired.result <- read.table(text = '
     id         my.string
      2    11...2...3..........
      5    ....................
      6    ......2.....2...4...
      7    .1...2....3..34.....
      8    ....1.....12....3...
      9    ..................44
     10    .2.......2..........
     11    ...2...2.....2...2..
', header = TRUE, na.strings = 'NA', stringsAsFactors = FALSE)

# obtain position of first and last non-dot
# from: http://stackoverflow.com/questions/29229333/position-of-first-and-last-non-dot-in-a-string-with-regex

first.last.dot <- data.frame(my.data, do.call(rbind, gregexpr("^\\.*\\K[^.]|[^.](?=\\.*$)", my.data[,2], perl=TRUE)))

# obtain non-dot elements
first.last.dot$first.element <- as.numeric(substr(first.last.dot$my.string, first.last.dot$X1, first.last.dot$X1))
first.last.dot$last.element  <- as.numeric(substr(first.last.dot$my.string, first.last.dot$X2, first.last.dot$X2))

# obtain some book-keeping variables
first.last.dot$number.within.group <- sequence(rle(first.last.dot$id)$lengths)
most.records.per.id                <- max(first.last.dot$number.within.group)
n.ids                              <- length(unique(first.last.dot$id))

# create matrices for recording data
positions.per.id <- matrix(NA, nrow = (n.ids), ncol=(most.records.per.id+1))
values.per.id    <- matrix(NA, nrow = (n.ids), ncol=(most.records.per.id+1))

# use nested for-loops to fill matrices with data
positions.per.id[1,1] = first.last.dot$X1[1]
   values.per.id[1,1] = first.last.dot$first.element[1]

positions.per.id[1,2] = first.last.dot$X2[1]
   values.per.id[1,2] = first.last.dot$last.element[1]

j = 1

for(i in 2:nrow(first.last.dot)) {

     if(first.last.dot$id[i] != first.last.dot$id[i-1]) j = j + 1

      positions.per.id[j, (first.last.dot$number.within.group[i]+0)] = first.last.dot$X1[i]
      positions.per.id[j, (first.last.dot$number.within.group[i]+1)] = first.last.dot$X2[i]

      values.per.id[j, (first.last.dot$number.within.group[i]+0)] = first.last.dot$first.element[i]
      values.per.id[j, (first.last.dot$number.within.group[i]+1)] = first.last.dot$last.element[i]
}

# convert matrix data into new strings using nested for-loops
new.strings <- matrix(0, nrow = nrow(positions.per.id), ncol = nchar(my.data$my.string[1]))

for(i in 1:nrow(positions.per.id)) {
     for(j in 1:ncol(positions.per.id)) {

          new.strings[i,positions.per.id[i,j]] <- values.per.id[i,j]
     }
}

# format new strings
new.strings[is.na(new.strings)] <- 0
new.strings[new.strings==0]     <- '.'

new.strings2 <- data.frame(id = unique(first.last.dot$id), my.string = (do.call(paste0, as.data.frame(new.strings))), stringsAsFactors = FALSE)
new.strings2

all.equal(desired.result, new.strings2)
# [1] TRUE

+4

string merge regex r

Mark miller Mar 26 '15 at 20:30

3

R , , . data.table ( 1.9.5 github, tstrsplit):

library(data.table)
dt = as.data.table(my.data) # or setDT to convert in place

dt[, paste0(lapply(tstrsplit(my.string, ""),
                   function(i) {
                     res = i[i != "."];
                     if (length(res) > 0)
                       res[1]
                     else
                       '.'
                   }), collapse = "")
   , by = id]
#   id                   V1
#1:  2 11...2...3..........
#2:  5 ....................
#3:  6 ......2.....2...4...
#4:  7 .1...2....3..34.....
#5:  8 ....1.....12....3...
#6:  9 ..................44
#7: 10 .2.......2..........
#8: 11 ...2...2.....2...2..

+2

eddi 26 . '15 20:58

It is possible to use functions from packages stringiand dplyr:

library(stringi)
library(dplyr)

# split my.string
m <- stri_split_boundaries(my.data$my.string, type = "character", simplify = TRUE)

df <- data.frame(id = my.data$id, m)

# function to apply to each column - select "." or unique "number"
myfun <- function(x) if(all(x == ".")) "." else unique(x[x != "."])


df %>%
  # for each id...
  group_by(id) %>%

  # ...and each column, apply function
  summarise_each(funs(myfun)) %>%

  # for each row...
  rowwise() %>%

 #...concatenate strings 
  do(data.frame(id = .[1], mystring = paste(.[-1], collapse = "")))

#   id             mystring
# 1  2 11...2...3..........
# 2  5 ....................
# 3  6 ......2.....2...4...
# 4  7 .1...2....3..34.....
# 5  8 ....1.....12....3...
# 6  9 ..................44
# 7 10 .2.......2..........
# 8 11 ...2...2.....2...2..

+2

Henrik Mar 26 '15 at 21:53

source share

bgoldst · Accepted Answer · 2015-03-26T21:13:46+0000

, . , , .

data.frame(id=unique(my.data$id), my.string=sapply(lapply(unique(my.data$id), function(id) gsub('^$','.',substr(gsub('\\.','',do.call(paste0,strsplit(my.data[my.data$id==id,'my.string'],''))),1,1)) ), function(x) paste0(x,collapse='') ), stringsAsFactors=F );

, :

lapply():

lapply(unique(my.data$id), function(id) ... )

, data.frame, . :

gsub('^$','.',substr(gsub('\\.','',do.call(paste0,strsplit(my.data[my.data$id==id,'my.string'],''))),1,1))

, :

strsplit(my.data[my.data$id==id,'my.string'],'')

my.string id strsplit(). list , , , . , , .

, ( id == 2):

[[1]]
 [1] "1" "1" "." "." "." "." "." "." "." "." "." "." "." "." "." "." "." "." "." "."

[[2]]
 [1] "." "1" "." "." "." "2" "." "." "." "." "." "." "." "." "." "." "." "." "." "."

[[3]]
 [1] "." "." "." "." "." "2" "." "." "." "3" "." "." "." "." "." "." "." "." "." "."

strsplit() ( ..., ):

do.call(paste0,...)

paste0() , , strsplit() . , , , :

 [1] "1.." "11." "..." "..." "..." ".22" "..." "..." "..." "..3" "..." "..." "..." "..." "..." "..." "..." "..." "..." "..."

paste0() :

gsub('\\.','',...)

, - , :

 [1] "1"  "11" ""   ""   ""   "22" ""   ""   ""   "3"  ""   ""   ""   ""   ""   ""   ""   ""   ""   ""

gsub() :

substr(...,1,1)

, , , . , , .

substr() :

gsub('^$','.',...)

, , , , . , id == 2:

 [1] "1" "1" "." "." "." "2" "." "." "." "3" "." "." "." "." "." "." "." "." "." "."

, lapply(). , list , . , , , :

sapply(..., function(x) paste0(x,collapse='') )

sapply() (simplify-apply) , , :

[1] "11...2...3.........." "...................." "......2.....2...4..." ".1...2....3..34....." "....1.....12....3..." "..................44" ".2.......2.........." "...2...2.....2...2.."

, , , data.frame, data.frame:

data.frame(id=unique(my.data$id), my.string=..., stringsAsFactors=F )

:

  id            my.string
1  2 11...2...3..........
2  5 ....................
3  6 ......2.....2...4...
4  7 .1...2....3..34.....
5  8 ....1.....12....3...
6  9 ..................44
7 10 .2.......2..........
8 11 ...2...2.....2...2..

!

Merge strings between strings by id

More articles: