Create a new column by combining several columns separated by a "-" in the data frame

How can I create a new column in a data frame by combining several existing columns, with their values separated by "-"?

# Example data frame with columns chr, start, end and CNA, parsed from an
# inline whitespace-delimited table whose first non-blank line is the header.
df <- read.table(
  header = TRUE,
  sep = "",
  text = "

   chr  start       end         CNA
    1  68580000    68640000    loss
    3  15360000    16000000    loss
    4  122660000   123500000   gain
    7  48320000    48400000    loss

"
)

Required output:

chr  start       end        CNA       sample.ID
1   68580000    68640000    loss    1-68580000-68640000
3   15360000    16000000    loss    3-15360000-16000000
4   122660000   123500000   gain    4-122660000-123500000
7   48320000    48400000    loss    7-48320000-48400000

I tried this

# Attempt from the question.
# NOTE(review): `final` and the column names "ID" and "seqnames" do not appear
# in the example data frame `df` (columns chr, start, end, CNA) -- presumably
# they come from the asker's real data; confirm before reusing.
cols <- c("ID","seqnames","start")
# No `sep` argument reaches paste() here, so the values are joined with
# paste()'s default separator (a single space) rather than "-".
df$sample.id  <- do.call(paste,rbind(final[cols]))

I am not sure how to separate the values in the new column with "-".

+4
source share
3 answers

You can try

# Columns whose values are joined, row by row, into one "-"-separated string.
cols <- c("chr", "start", "end")
# Each selected column becomes one argument to paste(); `sep` is appended as
# the final, named argument.
df[["sample.id"]] <- do.call("paste", c(as.list(df[cols]), sep = "-"))
df
#  chr     start       end  CNA             sample.id
#1   1  68580000  68640000 loss   1-68580000-68640000
#2   3  15360000  16000000 loss   3-15360000-16000000
#3   4 122660000 123500000 gain 4-122660000-123500000
#4   7  48320000  48400000 loss   7-48320000-48400000

or

# sprintf() alternative to paste(). The "%s-%s-...-%s" template is built from
# the number of entries in `cols`, so it cannot drift out of sync when columns
# are added to or removed from the selection.
do.call(
  sprintf,
  c(df[cols], fmt = paste(rep("%s", length(cols)), collapse = "-"))
)

Benchmarks

# Benchmark the three row-wise pasting approaches on one million rows of
# random integers (0..1000) spread over three columns.
library(microbenchmark)

set.seed(24)
df1 <- as.data.frame(
  matrix(sample(0:1000, 3 * 1e6, replace = TRUE), ncol = 3)
)

# paste() called once with every column as an argument.
akrun1 <- function() {
  do.call(paste, c(df1, sep = "-"))
}

# sprintf() called once with a fixed three-slot template.
akrun2 <- function() {
  do.call(sprintf, c(df1, fmt = "%s-%s-%s"))
}

# apply() over rows, collapsing each row with paste0().
Mamoun <- function() {
  apply(df1, 1, paste0, collapse = "-")
}

microbenchmark(akrun1(), akrun2(), Mamoun(), unit = "relative", times = 20L)
#Unit: relative
#    expr      min       lq     mean   median       uq      max neval cld
#akrun1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 
#akrun2() 1.003460 1.034647 1.053498 1.052192 1.048754 1.076809    20  a 
#Mamoun() 6.706523 7.460565 6.657434 6.626459 6.416989 6.105361    20   b

data

# Example data frame written out column by column: three integer columns
# (chr, start, end) and one character column (CNA), four rows.
df <- data.frame(
  chr = c(1L, 3L, 4L, 7L),
  start = c(68580000L, 15360000L, 122660000L, 48320000L),
  end = c(68640000L, 16000000L, 123500000L, 48400000L),
  CNA = c("loss", "loss", "gain", "loss"),
  stringsAsFactors = FALSE
)
+2
source

You can also try a similar solution using apply():

# Build the ID from the three source columns, selected by name instead of with
# the positional index `-4`. A negative index also picks up any column added
# later (for example an ID column created earlier); once a character column is
# included, apply()'s coercion to a character matrix runs the numeric columns
# through format(), which pads them with spaces and corrupts the pasted ID.
df$sample.ID <- apply(
  df[, c("chr", "start", "end")], 1, paste0, collapse = "-"
)
df
  chr     start       end  CNA             sample.ID
1   1  68580000  68640000 loss   1-68580000-68640000
2   3  15360000  16000000 loss   3-15360000-16000000
3   4 122660000 123500000 gain 4-122660000-123500000
4   7  48320000  48400000 loss   7-48320000-48400000
+2
source

You can also use unite() from the tidyr package (library(tidyr)); see http://rpubs.com/bradleyboehmke/data_wrangling .

# remove = FALSE keeps chr, start and end in the result; by default unite()
# drops the columns it combines, which would lose columns the required output
# still shows.
# NOTE(review): the position of the new column may differ from the required
# output (where it is last) -- reorder afterwards if column order matters.
df <- unite(df, sample.id, chr, start, end, sep = "-", remove = FALSE)
+1

All Articles