R - Effectively create a variable indicating whether the variable precedes the date (by group)

I have two dates ( date1and date2) and a variable idin the data.frame file:

dat <- data.frame(c('2014-02-11', '2014-05-04', '2014-05-22'), c('2014-04-12', '2014-09-22', '2014-07-04'), c('a', 'a', 'b'))
names(dat) <- c('date1', 'date2', 'id')
dat$date1 <- as.character.Date(dat$date1, format = '%Y-%m-%d')
dat$date2 <- as.character.Date(dat$date2, format = '%Y-%m-%d')
> dat
       date1      date2 id
1 2014-02-11 2014-04-12  a
2 2014-05-04 2014-09-22  a
3 2014-05-22 2014-07-04  b

I would like to create a new variable varthat indicates whether the date value is any before the date date2value date1for this string (and not just the value date2immediately preceding it):

> dat
       date1      date2 id var
1 2014-02-11 2014-04-12  a   0
2 2014-05-04 2014-09-22  a   1
3 2014-05-22 2014-07-04  b   0

I was able to achieve this with the following loop:

ids <- as.vector(unique(unlist(dat$id)))
dat$var <- as.numeric(0)
for (i in ids) {
  date2s <- as.vector(unlist(filter(dat, id == i)$date2))
  for (j in date2s) {
    dat <- dat %>% mutate(var = replace(var, (j < date1) & (id == i), 1)) # if any cdate precedes rdate
  }
}

However, my data set is quite large, and I would like to achieve this using data.table, if possible, although I would be happy to approach this using dplyr, if there is an effective approach.

+6
4

...

library(data.table)

frank_first = function() dat[, v0 := as.logical(copy(.SD)[copy(.SD), on=.(id, date2 < date1), mult="first", .N, by=.EACHI]$N)]

frank_which = function() dat[, vw := !is.na(copy(.SD)[copy(.SD), on=.(id, date2 < date1), mult="first", which=TRUE])]

frank_any = function() dat[, v1 := .SD[copy(.SD), on=.(id, date2 < date1), .N, by=.EACHI]$N > 0L]

frank_min = function() dat[, v := as.logical(.SD[, min(date2), by=id][copy(.SD), on=.(id, V1 < date1), .N, by=.EACHI]$N)]

fun = function(x, y) x > min(y)
mtm <- function(df) {
    df$var <- NA  # new column, to be updated
    split(df$var, df$id) <-
        Map(fun, split(df$date1, df$id), split(df$date2, df$id))
    df
}

copy - /.

+ :

set.seed(2L)
N <- 1e5
ng = 1e4
dat <- data.table(date1=sample(seq(as.Date("1970-01-01"), Sys.Date(), by="1 day"), N, replace=TRUE), 
    date2=sample(seq(as.Date("1970-01-01"), Sys.Date(), by="1 day"), N, replace=TRUE),
    id=sample(ng, N, replace=TRUE))

df = data.frame(dat)

microbenchmark::microbenchmark(frank_first(), frank_which(), frank_any(), frank_min(), mtm(df), times=5L)

Unit: milliseconds
          expr       min        lq      mean    median        uq       max neval cld
 frank_first()  70.38654  70.72610  80.37284  73.33607  86.87363 100.54186     5  a 
 frank_which()  55.90631  57.16385  62.89525  61.82535  64.63895  74.94178     5  a 
   frank_any()  38.56254  39.42893  40.53816  39.85976  41.47074  43.36885     5  a 
   frank_min()  36.73850  36.90551  62.55768  45.44839  55.41056 138.28545     5  a 
       mtm(df) 186.44924 190.26654 209.38918 219.73829 224.06300 226.42884     5   b

, ( ) .

+5

.EACHI , @thelatemail

dat[dat, .(date1=i.date1, date2=i.date2, var=any(date2 < i.date1)), by=.EACHI, on=.(id)]

#   id      date1      date2   var
#1:  a 2014-02-11 2014-04-12 FALSE
#2:  a 2014-05-04 2014-09-22  TRUE
#3:  b 2014-05-22 2014-07-04 FALSE

:

set.seed(2L)
N <- 1e5
dat <- data.table(date1=sample(seq(as.Date("1970-01-01"), Sys.Date(), by="1 day"), N, replace=TRUE), 
    date2=sample(seq(as.Date("1970-01-01"), Sys.Date(), by="1 day"), N, replace=TRUE),
    id=sample(letters, N, replace=TRUE))

dt1 <- copy(dat)
tlmMtd <- function() {
    dt1[, rownum := .I]
    dt1[dt1[dt1, on="id", rownum[i.date2 < date1], allow.cartesian=TRUE], hit := 1]
}

dt2 <- copy(dat)
csMtd <- function() dt2[dt2, .(date1=i.date1, date2=i.date2, var=any(date2 < i.date1)), by=.EACHI, on=.(id)]


dt3 <- copy(dat)
frankMtd <- function() dt3[, v := .SD[copy(.SD), on=.(id, date2 < date1), .N, by=.EACHI]$N > 0L]

microbenchmark::microbenchmark(
    tlmMtd(),
    csMtd(),
    frankMtd(),
    times=5L)

# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval
# tlmMtd()   18528.9799 18652.2217 23486.4213 19116.8014 21140.5923 39993.511     5
# csMtd()     3801.2146  3943.6201  4984.6274  5341.4322  5673.6878  6163.182     5
# frankMtd()   176.4477   177.5576   191.9636   178.9564   182.0311   244.825     5
+6

, data.table. :.

library(data.table)

setDT(dat)
dat[, rownum := .I]
dat[dat[dat, on="id", rownum[i.date2 < date1]], hit := 1]
dat

#        date1      date2 id rownum hit
#1: 2014-02-11 2014-04-12  a      1  NA
#2: 2014-05-04 2014-09-22  a      2   1
#3: 2014-05-22 2014-07-04  b      3  NA

, on "id", , , , , hit.

+5

data.table, dplyr, , , , .

function(x, y)
    as.Date(x) > min(as.Date(y))

split() , Map(), , split<-(),

answer <- logical(nrow(dat))
split(answer, dat$id) <-
    Map(fun, split(dat$date1, dat$id), split(dat$date2, dat$id))

, , , . , ; fun() .

@chinsoon12 ( ),

df <- as.data.frame(dat)
mtm1 <- function(df) {
    answer <- logical(nrow(dat))
    split(answer, df$id) <-
        Map(fun, split(df$date1, df$id), split(df$date2, df$id))
    answer
}

> identical(mtm1(df), frankMtd()$v)
[1] TRUE
> microbenchmark::microbenchmark(frankMtd(), mtm(df), times=5L)
Unit: milliseconds
       expr        min        lq       mean     median         uq        max
 frankMtd() 1917.95697 1927.2548 1928.65821 1928.45893 1933.34159 1936.27878
   mtm1(df)   47.00293   47.0198   48.02849   47.10012   47.18432   51.83523
 neval cld
     5   b
     5  a 

1000 (id = sample(1000, N, replace = TRUE)),

Unit: milliseconds
       expr       min        lq      mean    median        uq      max neval
 frankMtd() 140.87859 140.88647 141.97093 141.86977 142.28619 143.9336     5
   mtm1(df)  61.82032  64.55505  64.61313  65.53642  65.53768  65.6162     5
 cld
   b
  a 

Date

mtm2 <- function(df) {
    answer <- logical(nrow(df))
    split(answer, df$id) <- Map(
        function(x, y) x > min(y),
        split(as.numeric(df$date1), df$id),
        split(as.numeric(df$date2), df$id)
    )
    answer
}

with 1e5 values ​​in 1e4 groups, with a idfactor (), and compared with the fastest frank_*()results

> identical(frank_any()$v, mtm1(df))
[1] TRUE
> identical(frank_any()$v, mtm2(df))
[1] TRUE

and

Unit: milliseconds
        expr       min        lq      mean    median        uq       max neval
 frank_any()  79.90262  80.43112  81.79228  81.18565  83.18963  84.25236     5
    mtm1(df) 237.00027 241.40299 244.83638 246.26495 249.47713 250.03658     5
    mtm2(df)  44.11074  46.17133  51.26976  47.03285  52.77204  66.26184     5
 cld
  b 
   c
 a
+4
source

All Articles