Loop through a data.table and create new columns based on some condition

I have a data.table with many columns. I need to loop through them and create new columns based on some condition. Currently I am writing a separate line of code for each column. Let me explain with an example. Consider data like this -

# Reproducible toy data: 20 rows, two grouping columns (town, tc) and eight
# numeric columns drawn from normal distributions.
library(data.table) # data.table() is not available without this

set.seed(71) # fix the RNG so the draws below are repeatable

DT <- data.table(
  town = rep(c("A", "B"), each = 10),
  tc = rep(c("C", "D"), 10),
  one = rnorm(20, 1, 1),
  two = rnorm(20, 2, 1),
  three = rnorm(20, 3, 1),
  four = rnorm(20, 4, 1),
  five = rnorm(20, 5, 2),
  six = rnorm(20, 6, 2),
  seven = rnorm(20, 7, 2),
  total = rnorm(20, 28, 3)
)

For each column from one to total, I need to create 4 new columns, i.e. mean, sd, uplimit and lowlimit, for a 2-sigma outlier calculation. This is how I am doing it -

# For every (town, tc) group, reduce each remaining column to four statistics.
# unlist() flattens the nested list and names the entries "<column>.<stat>"
# (e.g. one.mean, one.uplimit); as.list() turns them into one row per group.
DTnew <- DT[, {
  stats_by_col <- lapply(.SD, function(v) {
    list(
      mean = mean(v),
      sd = sd(v),
      uplimit = mean(v) + 1.96 * sd(v),
      lowlimit = mean(v) - 1.96 * sd(v)
    )
  })
  as.list(unlist(stats_by_col))
}, by = .(town, tc)]

This gives the DTnew data.table. Then I merge it with my DT

# Attach the per-group statistics to every original row via the group keys.
DTmerge <- merge(x = DT, y = DTnew, by = c("town", "tc"))

Now, to flag the outliers, I write a separate line of code for each variable -

# Flag values outside their group's [lowlimit, uplimit] band: 1 = outside,
# 0 = inside. `:=` adds each column to DTmerge by reference, so the updates
# happen in place and DTAoutlier is then bound to that same table.
DTmerge[, one.Aoutlier := ifelse(
  one < one.lowlimit | one > one.uplimit, 1, 0
)]
DTmerge[, two.Aoutlier := ifelse(
  two < two.lowlimit | two > two.uplimit, 1, 0
)]
DTmerge[, three.Aoutlier := ifelse(
  three < three.lowlimit | three > three.uplimit, 1, 0
)]
DTAoutlier <- DTmerge

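Can anyone help simplify this code so that

  • I don't have to write a separate line of code for each outlier column? This example has only 8 variables, but with 100 variables would I end up writing 100 lines of code? Can this be done with a for loop? How?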

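  • In general, with data.table, how can new columns be added while keeping the original ones? For example, below I take the log of columns 3 to 10. Unless I create a new table DTlog, the original columns in DT are overwritten. How can I keep the original columns in DT and also have the new columns in DT?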

    # Log-transform columns 3 to 10 per (town, tc) group. The result is a new
    # table (group keys plus the logged columns); DT itself is not modified.
    DTlog <- DT[, lapply(.SD, log), by = .(town, tc), .SDcols = 3:10]

.

+4
3

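We can do this in a single step with `:=`. First, take the names of the columns to summarise ("nm"). Then build the vector of new column names with `outer` ('nm1'). As in the OP's code, the per-column results are unlisted and assigned (`:=`) to the "nm1" columns.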

# Columns to summarise: everything except the two grouping columns.
nm <- names(DT)[-(1:2)]

# Names for the new statistic columns, in the order the values are produced by
# the `:=` call that follows: all four statistics of the first column, then
# all four of the second, and so on. outer() returns a 4 x length(nm) matrix
# (rows = statistics, columns = variables) and c() reads it column by column,
# which is exactly that order. Transposing first would give Mean_one,
# Mean_two, ... and attach every value to the wrong name.
nm1 <- c(outer(c("Mean", "SD", "uplimit", "lowlimit"), nm, paste, sep = "_"))

# Add the statistic columns named in nm1 to DT by reference, per (town, tc)
# group. Each column of .SD yields four values in the fixed order Mean, SD,
# uplimit, lowlimit; unlist(recursive = FALSE) flattens one level, so the
# values arrive column by column (four per source column).
DT[, (nm1) := {
  per_col <- lapply(.SD, function(v) {
    centre <- mean(v)
    spread <- sd(v)
    list(centre, spread, centre + 1.96 * spread, centre - 1.96 * spread)
  })
  unlist(per_col, recursive = FALSE)
}, by = .(town, tc)]

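Next, subset the "lowlimit" and "uplimit" columns, compare them with the original columns, and coerce the logical result to binary with `+`. The result is assigned to the outlier columns.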

# Build a 0/1 matrix with one column per variable in nm: 1 when the value lies
# outside [lowlimit, uplimit], 0 when it lies inside. This matches the
# question's definition of the *.Aoutlier columns (0 = within limits,
# 1 = outlier); the earlier form flagged the in-range values instead.
# Comparing two same-shaped tables works position by position and yields a
# logical matrix; unary `+` converts it to integer.
m1 <- +(
  DT[, nm, with = FALSE] <
    DT[, paste("lowlimit", nm, sep = "_"), with = FALSE] |
    DT[, nm, with = FALSE] >
      DT[, paste("uplimit", nm, sep = "_"), with = FALSE]
)
DT[, paste(nm, "Aoutlier", sep = ".") := as.data.frame(m1)]

For large data.tables, a `for` loop with `set` may be faster

# Same outlier flags, filled column by column with set(), which updates DT by
# reference. The flag is 1 when the value is outside [lowlimit, uplimit] and
# 0 when inside, matching the question's definition of *.Aoutlier (the earlier
# form flagged the in-range values instead).
nm2 <- paste(nm, "Aoutlier", sep = ".")
DT[, (nm2) := NA_integer_] # create the integer target columns up front
for (k in seq_along(nm)) {
  set(
    DT,
    i = NULL,
    j = nm2[k], # reuse the precomputed target name
    value = as.integer(
      DT[[nm[k]]] < DT[[paste("lowlimit", nm[k], sep = "_")]] |
        DT[[nm[k]]] > DT[[paste("uplimit", nm[k], sep = "_")]]
    )
  )
}

"" :=

# Add "<column>.log" columns to DT by reference; the original columns stay.
DT[,
  paste0(nm, ".log") := lapply(.SD, log),
  by = .(town, tc),
  .SDcols = nm
]
+4

, , :

# Reshape to long format: one row per (town, tc, variable) observation.
m <- melt(DT, id.vars = c("town", "tc"))

# Within each group and variable, flag values more than 1.96 standard
# deviations away from the group mean (1 = outlier, 0 = not).
m[, is_outlier := as.integer(abs(value - mean(value)) > 1.96 * sd(value)),
  by = .(town, tc, variable)]

(There are no outliers in this example):

# Count rows per flag value; a handy alternative to table().
m[, .N, by = "is_outlier"]

#    is_outlier   N
# 1:          0 160

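  • melt keeps the id columns and stacks all other columns into two new columns:
    • variable (the original column names)
    • value (the original column values)
  • +x does the same thing as as.integer(x), converting TRUE/FALSE to 1/0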

:

# Value columns: everything except the grouping columns.
vjs <- setdiff(names(DT), c("town", "tc"))

# Add one "<column>.out" flag per value column, by reference: 1 when the value
# is more than 1.96 group standard deviations from the group mean, else 0.
DT[, paste(vjs, "out", sep = ".") := lapply(
  .SD,
  function(v) as.integer(abs(v - mean(v)) > 1.96 * sd(v))
), by = .(town, tc), .SDcols = vjs]
+3

Alternatively, with dplyr we can use mutate_each:

library(dplyr)

# Per (town, tc) group, add mean, sd, uplimit, lowlimit and an outlier flag
# for every non-grouping column.
# Aoutlier is 1 when the value lies outside [mean - 1.96*sd, mean + 1.96*sd]
# and 0 when inside, matching the question's definition. The earlier form
# used the lower limit for both bounds and flagged in-range values instead.
# NOTE: mutate_each()/funs() are deprecated in current dplyr releases;
# across() is the replacement when upgrading.
result <- DT %>%
    group_by(town,tc) %>%
    mutate_each(funs(mean,sd,
                     uplimit = (mean(.) + 1.96*sd(.)),
                     lowlimit = (mean(.) - 1.96*sd(.)),
                     Aoutlier = as.integer(. < mean(.) - 1.96*sd(.) |
                                               . > mean(.) + 1.96*sd(.))),
                -town,-tc)
0
source

All Articles