I am going to suggest that this is genetic data. This simplifies the creation of all heterogeneous base pairs and replaces them with a regular expression:
bases <-c("A","C","G","T")
b1 <- rep(bases, 4)
b2 <- rep(bases, each=4)
hetero<- paste0(b1[b1!=b2],b2[b2!=b1])
DF[] <- lapply(DF,gsub, pattern=paste0(hetero,collapse="|"),replacement="00")
OR
m <- as.matrix(DF)
m[m %in% hetero] <- "00"
res <- as.data.frame(m)
Benchmarks
Because benchmarking is fun, and there are many different solutions in this thread. An amazing conclusion: the differences are not very large, and the winner is David X (second second Conrad).
1000 1000 :
Unit: milliseconds
expr min lq mean median uq max neval cld
MrFlick 402.0281 477.4867 494.6892 484.5600 504.6442 592.0486 50 d
Heroka 227.1143 298.8655 333.7875 309.4572 375.5734 459.6164 50 c
Heroka2 696.2465 710.0094 733.5981 717.8195 775.4891 803.7156 50 e
DavidH 124.7802 127.9947 137.0511 130.3487 134.9696 210.5570 50 a
Konrad 144.0454 214.8844 231.9005 221.9659 291.3668 344.4238 50 b
Konrad2 699.5301 711.7724 750.1756 736.2112 787.4504 849.0606 50 e
b1 <- rep(bases, 4)
b2 <- rep(bases, each=4)
all <- paste0(b1,b2)
largedat <- data.frame(matrix(sample(all,1000000,T),ncol=1000))
tests <- microbenchmark(
MrFlick = MrFlick(largedat),
Heroka = Heroka (largedat),
Heroka2= Heroka2(largedat),
DavidH=DavidH(largedat),
Konrad = Konrad(largedat),
Konrad2 = Konrad2(largedat),
times=50)
MrFlick <- function(DF){
as.data.frame(gsub("^(.)(?!\\1).$","00", as.matrix(DF), perl=T))
}
Heroka <- function(DF){
bases <-c("A","C","G","T")
b1 <- rep(bases, 4)
b2 <- rep(bases, each=4)
hetero<- paste0(b1[b1!=b2],b2[b2!=b1])
m <- as.matrix(DF)
m[m %in% hetero] <- "00"
res <- as.data.frame(m)
res
}
Heroka2 <- function(DF){
DF[] <- lapply(DF,gsub, pattern=paste0(hetero,collapse="|"),replacement="00")
DF
}
DavidH <- function(DF){
ex <- expand.grid(c("A","T","C","G"),c("A","T","C","G"))
ex <- ex[ex[1]!=ex[2],]
het.combs <- apply(ex,1,function(i) {paste0(i[1],i[2])} )
map <- setNames( rep("00",length(het.combs)) , het.combs )
fac.df<- lapply(DF, as.factor)
fac.df <- lapply(fac.df, function(i){levels(i)[levels(i) %in% names(map)] <- map[levels(i)[levels(i) %in% names(map)]];i } )
DF <- as.data.frame(fac.df)
}
Konrad <- function(DF){
bases = c('A', 'C', 'G', 'T')
homozygous = apply(cbind(bases, bases), 1, paste, collapse = '')
DF = as.matrix(DF)
DF[! DF %in% homozygous] = '00'
DF
}
Konrad2 <-function(DF){
bases = c('A', 'C', 'G', 'T')
homozygous = apply(cbind(bases, bases), 1, paste, collapse = '')
DF = data.frame(lapply(DF, function (x) ifelse(x %in% homozygous, x, '00')))
}