How about this?
# Set to NA every value in columns 2-4 that already appeared earlier in the
# same row. apply() returns one column per row of df, so the logical mask is
# transposed back to df's orientation before it is used as an index.
df[, 2:4] <- replace(df[, 2:4], t(apply(df[, 2:4], 1, duplicated)), NA)
Edit: Faster basic solution:
# Compare every pair of columns from the second column onwards; wherever a
# later column repeats the value of an earlier one in the same row, set the
# later cell to NA. The seq_len() ranges are empty (instead of counting
# backwards like `2:1`) when df has fewer than three columns.
for (i in seq_len(ncol(df))[-1]) {
  for (j in seq_len(ncol(df))[-seq_len(i)]) {
    chk <- df[[i]] == df[[j]]
    # Comparisons against cells that are already NA yield NA; which() keeps
    # only the positions that are actually TRUE.
    df[[j]][which(chk)] <- NA
  }
}
Here is a benchmark of the two methods above, along with AnandaMahto's reshape2 and data.table solutions, on a larger data set. The for-loop over column pairs (i, j) appears to be the fastest.
Benchmarking Results:
# library() stops with an error if the package is missing, whereas require()
# would only return FALSE and let the next call fail less clearly.
library(microbenchmark)

# Time each approach ten times on the same data. ananda.dt() is handed a
# copy of DT in each evaluation.
microbenchmark(
  ar.f <- arun.f(df),
  ar.s <- arun.s(df),
  an.f <- ananda.ave(df),
  an.s <- ananda.dt(copy(DT)),
  times = 10
)
Data Creation:
# Reproducible test data: 20 IDs x 10,000 rows each (200,000 rows) and three
# integer code columns drawn from 1..10.
set.seed(1234)
df <- cbind(
  data.frame(ID = rep(letters[1:20], each = 1e4), stringsAsFactors = FALSE),
  matrix(sample(1:10, 6 * 1e5, replace = TRUE), ncol = 3)
)
names(df)[2:4] <- paste0("code", 1:3)
My first version:
#' Blank out repeated values within each row using row-wise `duplicated()`.
#'
#' @param df A data frame whose columns 2 to 4 hold the values to compare.
#' @return `df` with every value in columns 2-4 that already occurred
#'   earlier in the same row replaced by `NA`.
arun.f <- function(df) {
  # apply() yields one column per input row, hence the transpose before the
  # mask is used to index the three columns.
  df[, 2:4][t(apply(df[, 2:4], 1, duplicated))] <- NA
  df
}
My second version:
#' Blank out repeated values within each row by pairwise column comparison.
#'
#' @param df A data frame. The first column is left untouched; every column
#'   from the third onwards is compared with each earlier column, starting at
#'   column 2.
#' @return `df` with a cell set to `NA` wherever it equals the value of an
#'   earlier column (column 2 onwards) in the same row. A data frame with
#'   fewer than three columns is returned unchanged.
arun.s <- function(df) {
  n_col <- ncol(df)
  # seq_len()-based ranges are empty when there is nothing to compare,
  # unlike `2:(n_col - 1)`, which counts backwards for narrow inputs.
  for (i in seq_len(n_col)[-1]) {
    for (j in seq_len(n_col)[-seq_len(i)]) {
      chk <- df[[i]] == df[[j]]
      # Comparisons involving cells already set to NA give NA; which() keeps
      # only the positions that are actually TRUE.
      df[[j]][which(chk)] <- NA
    }
  }
  df
}
Ananda ave + reshape2 :
library(reshape2)

#' Blank out repeated values within each row via melt / dcast.
#'
#' @param df A data frame with an `ID` column; all other columns are treated
#'   as measure variables by `melt()`.
#' @return The result of `dcast()`: one row per `ID` / `ID2` combination and
#'   one column per original measure variable, with values that repeat
#'   within the same `ID` / `ID2` pair set to `NA`.
ananda.ave <- function(df) {
  # ID2 numbers the rows inside each ID so that every original row has a
  # unique (ID, ID2) key after reshaping.
  # NOTE(review): with a character ID, ave() appears to return ID2 as
  # character as well, which would affect how dcast() orders rows - confirm.
  df$ID2 <- with(df, ave(ID, ID, FUN = seq_along))
  m.df <- melt(df, id.vars = c("ID", "ID2"))
  # A long-format row is a repeat when ID, ID2 and value all match an
  # earlier row; the `variable` column is ignored for that comparison.
  is_repeat <- duplicated(m.df[setdiff(names(m.df), "variable")])
  m.df[is_repeat, "value"] <- NA
  # Name the value column explicitly instead of letting dcast() guess it.
  dcast(m.df, ID + ID2 ~ variable, value.var = "value")
}
Ananda data.table :
(slightly modified for better performance)
library(data.table)

DT <- data.table(df)

#' Blank out repeated values within each row using data.table.
#'
#' @param dt A data.table with an `ID` column and three further value
#'   columns.
#' @return A data.table with columns `ID`, `ID2` (row number within `ID`)
#'   and `code1`..`code3`, where values repeated within the same
#'   `ID` / `ID2` pair are `NA`.
ananda.dt <- function(dt) {
  # Long format: per ID, stack all value columns into `Value`. ID2 is
  # repeated once per stacked column with an explicit rep() rather than
  # relying on implicit recycling of a shorter column in j.
  temp <- dt[, list(
    ID2 = rep(seq_len(.N), times = length(.SD)),
    Value = unlist(.SD, use.names = FALSE)
  ), by = "ID"]
  # Rows identical in ID, ID2 and Value are repeats within one original row.
  temp[duplicated(temp), Value := NA]
  # Back to wide format: the stacked values of each (ID, ID2) pair become
  # separate columns, which are then renamed.
  out <- temp[, as.list(Value), by = list(ID, ID2)]
  setnames(out, 3:5, paste0("code", 1:3))
  # Return the table visibly; ending on an assignment returned it invisibly.
  out
}