The dataset provided by the OP does not reveal the complexity of the task. In terms of solving the OP question, so far only Mike’s answer has been correct.
In fact, within 8 days of rolling, instead of 7 calendar days, due to d <= 0 & d >= -7 .
zoo using @G. A grottendik is almost valid only if merge is done for each group of ID .
Below the second solution is data.table, this time the actual results using dev RcppRoll, which allows na.rm=TRUE .
And a slightly formatted output from Mike's solution.
data<-as.data.frame(matrix(NA,42,3)) data$V1<-seq(as.Date("2014-05-01"),as.Date("2014-09-01"),by=3) data$V2<-rep(1:6,7) data$V3<-rep(c(1,2),21) colnames(data)<-c("Date","USD","ID") library(microbenchmark) library(RcppRoll) # install_github("kevinushey/RcppRoll") library(data.table) # install_github("Rdatatable/data.table") correct_jan_dt = function(n, partial=TRUE){ DT = as.data.table(data) # this can be speedup by setDT() date.range = DT[,range(Date)] all.dates = seq.Date(date.range[1],date.range[2],by=1) setkey(DT,ID,Date) r = DT[CJ(unique(ID),all.dates)][, c("roll") := as.integer(roll_sumr(USD, n, normalize = FALSE, na.rm = TRUE)), by="ID"][!is.na(USD)] # This could be simplified when `partial` arg will be implemented in [kevinushey/RcppRoll](https://github.com/kevinushey/RcppRoll) if(isTRUE(partial)){ r[is.na(roll), roll := cumsum(USD), by="ID"][] } return(r[order(Date,ID)]) } correct_mike_dt = function(){ data = as.data.table(data)[,ID2:=.GRP,by=c("ID")] #Build reference table Ref <- data[,list(Compare_Value=list(I(USD)),Compare_Date=list(I(Date))), by=c("ID2")] #Use mapply to get last seven days of value by id data[, c("roll") := mapply(RD = Date,NUM=ID2, function(RD, NUM){ d <- as.numeric(Ref$Compare_Date[[NUM]] - RD) sum((d <= 0 & d >= -7)*Ref$Compare_Value[[NUM]])})][,ID2:=NULL][] } identical(correct_mike_dt(), correct_jan_dt(n=8,partial=TRUE)) # [1] TRUE microbenchmark(unit="relative", times=5L, correct_mike_dt(), correct_jan_dt(8)) # Unit: relative # expr min lq mean median uq max neval # correct_mike_dt() 274.0699 273.9892 267.2886 266.6009 266.2254 256.7296 5 # correct_jan_dt(8) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 5
Looking forward to an update from @Khashaa.
Edit (20150122.2): below the benchmarks do not answer the OP question.
Dates on a larger (still very tiny) dataset, 5439 rows:
library(zoo) library(data.table) library(dplyr) library(RcppRoll) library(microbenchmark) data<-as.data.frame(matrix(NA,5439,3)) data$V1<-seq(as.Date("1970-01-01"),as.Date("2014-09-01"),by=3) data$V2<-sample(1:6,5439,TRUE) data$V3<-sample(c(1,2),5439,TRUE) colnames(data)<-c("Date","USD","ID") zoo_f = function(){ z <- read.zoo(data) z0 <- merge(z, zoo(, seq(start(z), end(z), "day")), fill = 0) # expand to daily roll <- function(x) rollsumr(x, 7, fill = NA) transform(data, roll = ave(z0$USD, z0$ID, FUN = roll)[time(z)]) } dt_f = function(){ DT = as.data.table(data) # this can be speedup by setDT() date.range = DT[,range(Date)] all.dates = seq.Date(date.range[1],date.range[2],by=1) setkey(DT,Date) DT[.(all.dates) ][order(Date), c("roll") := rowSums(setDT(shift(USD, 0:6, NA, "lag")),na.rm=FALSE), by="ID" ][!is.na(ID)] } dp_f = function(){ data %>% group_by(ID) %>% mutate(roll=roll_sum(c(rep(NA,6), USD), 7)) } dt2_f = function(){ # this can be speedup by setDT() as.data.table(data)[, c("roll") := roll_sum(c(rep(NA,6), USD), 7), by="ID"][] } identical(as.data.table(zoo_f()),dt_f()) # [1] TRUE identical(setDT(as.data.frame(dp_f())),dt_f()) # [1] TRUE identical(dt2_f(),dt_f()) # [1] TRUE microbenchmark(unit="relative", times=20L, zoo_f(), dt_f(), dp_f(), dt2_f()) # Unit: relative # expr min lq mean median uq max neval # zoo_f() 140.331889 141.891917 138.064126 139.381336 136.029019 137.730171 20 # dt_f() 14.917166 14.464199 15.210757 16.898931 16.543811 14.221987 20 # dp_f() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 # dt2_f() 1.536896 1.521983 1.500392 1.518641 1.629916 1.337903 20
But I'm not sure my data.table code is already optimal.
The above functions did not answer the OP question. Read the top of the post for updates. Mike's decision was right.