Having both the ordered columns "id" and "time", we could create a single group to work by creating a logical vector of indices wherever either "id" or "time" changes → 15 minutes.
WITH
id = as.character(mydf$uniqueid)
tm = mydf$time
find where "id":
id_gr = c(TRUE, id[-1] != id[-length(id)])
and time":
tm_gr = c(0, difftime(tm[-1], tm[-length(tm)], unit = "mins")) > 15
change and merge them into:
gr = id_gr | tm_gr
where either "id" or "time"> 15 is displayed. And to get the result:
tab = tabulate(cumsum(gr))
data.frame(id = id[gr], flow = seq_along(tab), n = tab)
On a larger scale:
set.seed(1821); nid = 1e4
dat = replicate(nid, as.POSIXct("2016-07-07 12:00:00 EEST") +
cumsum(sample(c(1, 5, 10, 15, 20, 30, 45, 60, 90, 120, 150, 200, 250, 300), sample(5e2:1e3, 1), TRUE)*60),
simplify = FALSE)
names(dat) = make.unique(rep_len(letters, nid))
dat = data.frame(id = rep(names(dat), lengths(dat)), time = do.call(c, dat))
system.time({
id = as.character(dat$id); tm = dat$time
id_gr = c(TRUE, id[-1] != id[-length(id)])
tm_gr = c(0, difftime(tm[-1], tm[-length(tm)], unit = "mins")) > 15
gr = id_gr | tm_gr
tab = tabulate(cumsum(gr))
ans1 = data.frame(id = id[gr], flow = seq_along(tab), n = tab)
})
# user system elapsed
# 1.44 0.19 1.66
For comparison, MikeyMike answer:
library(data.table)
dat2 = copy(dat)
system.time({
ans2 = setDT(dat2)[, list(flow = .GRP, n = .N),
by = .(id, cumsum(as.numeric(difftime(time,
shift(time, 1L, type = "lag", fill = 0),
unit = "mins")) > 15))][, cumsum := NULL]
})
identical(as.data.table(ans1), ans2)