Data.table updates the last item in a group based on a condition

Question

Data.table updates the last item in a group based on a condition

I have data.table with 3 columns: id, time and status. For each identifier, I want to find the record with the maximum time - then if the status for this record is true, I want to set it to false if the time is> 7 (for example). I do it as follows.

x <- data.table(id=c(1,1,2,2),time=c(5,6,7,8),status=c(FALSE,TRUE,FALSE,TRUE)) setkey(x,id,time) y <- x[,.SD[.N],by=id] x[y,status:=status & time > 7]

I have a lot of data that I work with and would like to speed up this operation. Any suggestions would be appreciated!

+7

r data.table

user2506086 Mar 03 '16 at 0:29

source share

4 answers

Rentrop · Answer 1 · 2016-03-03T00:49:43+0000

 x[x[,.N, by=id][,cumsum(N)], status := status * time <=7]

If I'm not mistaken, this is not a union, since x[,.N, by=id][,cumsum(N)] returns the row indices of the last elements per group.

Update: After looking at the speed comparison, this seems to be the winner and should be listed first.

This was my initial attempt, which turned out to be the slowest of all the proposed solutions.

 x[,status := c(.SD[-.N, status], .SD[.N, status * time <=7]), by=id]

SymbolixAU · Answer 2 · 2016-03-03T00:37:39+0000

One data.table approach

 x[ x[order(time), .I[.N], by=id]$V1 , status := ifelse(time > 7, FALSE, TRUE)] > x # id time status #1: 1 5 FALSE #2: 1 6 TRUE #3: 2 7 FALSE #4: 2 8 FALSE

as x[order(time), .I[.N], by=id]$V1 gives us the row index maximum time for each group ( id )

And by going from @ Floo0's answer, we can simplify it a bit to

 x[ x[order(time), .I[.N], by=id]$V1 , status := status * time <= 7]

Speed comparison

Checking the speed of various responses (and storing keys on data)

 set.seed(123) x <- data.table(id=c(rep(seq(1:10000), each=10)), time=c(rep(seq(1:10000), 10)), status=c(sample(c(TRUE, FALSE), 10000*10, replace=T))) setkey(x,id,time) x1 <- copy(x); x2 <- copy(x); x3 <- copy(x); x4 <- copy(x); x5 <- copy(x); x6 <- copy(x) library(microbenchmark) microbenchmark( Symbolix = {x1[ x1[order(time), .I[.N], by=id]$V1 , status := status * time < 7 ] }, Floo0_1 = {x2[,status := c(.SD[-.N, status], .SD[.N, status * time > 7]), by=id]}, Floo0_2 = {x3[x3[,.N, by=id][,cumsum(N)], status := status * time > 7]}, Original = { y <- x4[,.SD[.N],by=id] x4[y,status:=status & time > 7] }, Frank = { y <- x5[, .SD[.N, .(time, status)], by=id][time > 7 & status] x5[y, status := FALSE] }, thelatemail = {x6[ x6[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE]} ) Unit: milliseconds expr min lq mean median uq max neval cld Symbolix 5.419768 5.857477 6.514111 6.222118 6.936000 11.284580 100 a Floo0_1 4550.314775 4710.679867 4787.086279 4776.794072 4850.334011 5097.136148 100 c Floo0_2 1.653419 1.792378 1.945203 1.881609 2.014325 4.096006 100 a Original 10.052947 10.986294 12.541595 11.431182 12.391287 89.494783 100 a Frank 4609.115061 4697.687642 4743.886186 4735.086113 4785.212543 4932.270602 100 b thelatemail 10.300864 11.594972 12.421889 12.315852 12.984146 17.630736 100 a

thelatemail · Answer 3 · 2016-03-03T02:11:20+0000

Another attempt:

 x[ x[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE] x # id time status #1: 1 5 FALSE #2: 1 6 TRUE #3: 2 7 FALSE #4: 2 8 FALSE

Frank · Answer 4 · 2016-03-03T05:56:39+0000

Here's another way similar to OP:

 y = unique(x[,c("id","time"), with=FALSE], by="id", fromLast=TRUE) x[y[time > 7], status := FALSE]

Here is another test:

 n_id = 1e3; n_col = 100; n_draw = 5 set.seed(1) X = data.table(id = 1:n_id)[, .( time = sample(10,n_draw), status = sample(c(T,F), n_draw, replace=TRUE) ), by=id][, paste0("V",1:n_col) := 0] setkey(X,id,time) X1 = copy(X); X2 = copy(X); X3 = copy(X); X4 = copy(X) X5 = copy(X); X6 = copy(X); X7 = copy(X); X8 = copy(X) library(microbenchmark) library(multcomp) microbenchmark( unique = { Y = unique(X1[,c("id","time"), with=FALSE], by="id", fromLast=TRUE) X1[Y[time > 7], status := FALSE] }, OP = { y <- X2[,.SD[.N],by=id] X2[y,status:=status & time > 7] }, Floo0a = X3[,status := c(.SD[-.N, status], .SD[.N, status * time >7]), by=id], Floo0b = X4[X4[,.N, by=id][,cumsum(N)], status := status * time >7], tlm = X5[ X5[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE], Symbolix=X6[ X6[order(time), .I[.N], by=id]$V1 , status := ifelse(time > 7, FALSE, TRUE)], Frank1 = { y <- X7[, .SD[.N, .(time, status)], by=id][time > 7 & status] X7[y, status := FALSE] }, Frank2 = { y <- X8[, .SD[.N], by=id][time > 7 & status] X8[y, status := FALSE] }, times = 1, unit = "relative")

Result:

  expr min lq mean median uq max neval unique 1.348592 1.348592 1.348592 1.348592 1.348592 1.348592 1 OP 35.048724 35.048724 35.048724 35.048724 35.048724 35.048724 1 Floo0a 416.175654 416.175654 416.175654 416.175654 416.175654 416.175654 1 Floo0b 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1 tlm 2.151996 2.151996 2.151996 2.151996 2.151996 2.151996 1 Symbolix 1.770835 1.770835 1.770835 1.770835 1.770835 1.770835 1 Frank1 404.045660 404.045660 404.045660 404.045660 404.045660 404.045660 1 Frank2 36.603303 36.603303 36.603303 36.603303 36.603303 36.603303 1

Data.table updates the last item in a group based on a condition

More articles: