How to change heterogeneous double letters in r

I have a data frame:

DF = read.table(text="S01   S02     S03    S04    S05   S06
TT     CC     TT     CT     TT     00
AC     AA     AC     CC     AA     AA
CC     TC     CC     TT     CC     00
CC     AC     CC     AC     AA     CC
GG     00     TG     TT     GG     TG
GG     GA     GG     GA     GG     GG", header=T, stringsAsFactors=F)

I would like to change all heterogeneous values ​​(double letters) to double β€œ00” in a faster way. Expected Result:

S01   S02     S03    S04    S05   S06
TT     CC     TT     00     TT     00
00     AA     00     CC     AA     AA
CC     00     CC     TT     CC     00
CC     00     CC     00     AA     CC
GG     00     00     TT     GG     00
GG     00     GG     00     GG     GG

appreciate any help!

+4
source share
5 answers

You can use regex with negative appearance

as.data.frame(gsub("^(.)(?!\\1).$","00", as.matrix(DF), perl=T))
#   S01 S02 S03 S04 S05 S06
# 1  TT  CC  TT  00  TT  00
# 2  00  AA  00  CC  AA  AA
# 3  CC  00  CC  TT  CC  00
# 4  CC  00  CC  00  AA  CC
# 5  GG  00  00  TT  GG  00
# 6  GG  00  GG  00  GG  GG
+3
source

I am going to suggest that this is genetic data. This simplifies the creation of all heterogeneous base pairs and replaces them with a regular expression:

bases <-c("A","C","G","T")
b1 <- rep(bases, 4)
b2 <- rep(bases, each=4)
hetero<- paste0(b1[b1!=b2],b2[b2!=b1])

DF[] <- lapply(DF,gsub, pattern=paste0(hetero,collapse="|"),replacement="00")

OR

m <- as.matrix(DF)
m[m %in% hetero] <- "00"
res <- as.data.frame(m)

Benchmarks

Because benchmarking is fun, and there are many different solutions in this thread. An amazing conclusion: the differences are not very large, and the winner is David X (second second Conrad).

1000 1000 :

Unit: milliseconds
    expr      min       lq     mean   median       uq      max neval   cld
 MrFlick 402.0281 477.4867 494.6892 484.5600 504.6442 592.0486    50    d 
  Heroka 227.1143 298.8655 333.7875 309.4572 375.5734 459.6164    50   c  
 Heroka2 696.2465 710.0094 733.5981 717.8195 775.4891 803.7156    50     e
  DavidH 124.7802 127.9947 137.0511 130.3487 134.9696 210.5570    50 a    
  Konrad 144.0454 214.8844 231.9005 221.9659 291.3668 344.4238    50  b   
 Konrad2 699.5301 711.7724 750.1756 736.2112 787.4504 849.0606    50     e


#Data generated:

b1 <- rep(bases, 4)
b2 <- rep(bases, each=4)
all <- paste0(b1,b2)
largedat <- data.frame(matrix(sample(all,1000000,T),ncol=1000))

#benchmarking code

tests <- microbenchmark(
  MrFlick = MrFlick(largedat),
  Heroka = Heroka (largedat),
  Heroka2= Heroka2(largedat),
  DavidH=DavidH(largedat),
  Konrad = Konrad(largedat),
  Konrad2 = Konrad2(largedat),
  times=50)
#  Functions used:

MrFlick <- function(DF){
  as.data.frame(gsub("^(.)(?!\\1).$","00", as.matrix(DF), perl=T))
}

Heroka <- function(DF){
  bases <-c("A","C","G","T")
  b1 <- rep(bases, 4)
  b2 <- rep(bases, each=4)
  hetero<- paste0(b1[b1!=b2],b2[b2!=b1])
  m <- as.matrix(DF)
  m[m %in% hetero] <- "00"
  res <- as.data.frame(m)
  res
}

Heroka2 <- function(DF){
  DF[] <- lapply(DF,gsub, pattern=paste0(hetero,collapse="|"),replacement="00")
  DF
}

DavidH <- function(DF){
  ex <- expand.grid(c("A","T","C","G"),c("A","T","C","G"))
  ex <- ex[ex[1]!=ex[2],]
  het.combs <- apply(ex,1,function(i) {paste0(i[1],i[2])} )
  map <- setNames( rep("00",length(het.combs)) , het.combs )
  fac.df<- lapply(DF, as.factor)

  fac.df <- lapply(fac.df, function(i){levels(i)[levels(i) %in% names(map)] <- map[levels(i)[levels(i) %in% names(map)]];i } )
  DF <- as.data.frame(fac.df)
}

Konrad <- function(DF){
  bases = c('A', 'C', 'G', 'T')
  homozygous = apply(cbind(bases, bases), 1, paste, collapse = '')

  DF = as.matrix(DF)
  DF[! DF %in% homozygous] = '00'
  DF
}

Konrad2 <-function(DF){
  bases = c('A', 'C', 'G', 'T')
  homozygous = apply(cbind(bases, bases), 1, paste, collapse = '')
  DF = data.frame(lapply(DF, function (x) ifelse(x %in% homozygous, x, '00')))
}
+5

, , :

ex <- expand.grid(c("A","T","C","G"),c("A","T","C","G"))
ex <- ex[ex[1]!=ex[2],]
het.combs <- apply(ex,1,function(i) {paste0(i[1],i[2])} )
map <- setNames( rep("00",length(het.combs)) , het.combs )
fac.df<- lapply(DF, as.factor)

fac.df <- lapply(fac.df, function(i){levels(i)[levels(i) %in% names(map)] <- map[levels(i)[levels(i) %in% names(map)]];i } )
DF <- as.data.frame(fac.df)
+2

"" . , ( ), .

bases = c('A', 'C', 'G', 'T')
homozygous = apply(cbind(bases, bases), 1, paste, collapse = '')

DF = as.matrix(DF)
DF[! DF %in% homozygous] = '00'

ifelse . , , , . , - , .

DF = data.frame(lapply(DF, function (x) ifelse(x %in% homozygous, x, '00')))
+2

First, I would appreciate Heroka's comparative comparison, David's solution is faster, and MrFlick is a compressed script. I also thank all the other answers. Based on your decisions, I have a faster solution that combines the solutions of MrFlick and David H. When read DF,stringsAsFactors=T

DF <- data.frame(lapply(DF, function(x) {
  levels(x) <- gsub("^(.)(?!\\1).$","00", levels(x), perl=T)
  return(x)
}))
+2
source

All Articles