After these conversations:
I wanted to test a more โrealโ case study. I recently had to migrate SAS code to R code and kdb to R code. I tried to compile a simple but more sophisticated example for optimization.
create a training set
buildDF <- function(N){ set.seed(123); dateTimes <- sort(as.POSIXct("2001-01-01 08:30:00") + floor(3600*runif(N))); set.seed(124); f <- floor(1+3*runif(N)); set.seed(123); s <- floor(1+3*runif(N)); return(data.frame(dateTime=dateTimes, f=f, s=s)); }
This is what you need to do.
f1 <- function(DF){ #init N <- nrow(DF); DF$num[1] = 1; for(i in 2:N){ if(DF$f[i] == 2){ DF$num[i] <- ifelse(DF$s[i-1] == DF$s[i],DF$num[i-1],1+DF$num[i-1]); }else{ #meaning f in {1,3} if(DF$f[i-1] != 2){ DF$num[i] = DF$num[i-1]; }else{ DF$num[i] = ifelse((DF$dateTime[i]-DF$dateTime[i-1])==0,DF$num[i-1],1+DF$num[i-1]); } } } return(DF) }
It's disgusting. Let it be a little vectorized:
f2 <- function(DF){ N <- nrow(DF); DF$add <- 1; DF$ds <- c(NA,diff(DF$s)); DF$lf <- c(NA,DF$f[1:(N-1)]); DF$dt <- c(NA,diff(DF$dateTime)); DF$add[DF$f == 2 & DF$ds == 0] <- 0; DF$add[DF$f == 2 & DF$ds != 0] <- 1; DF$add[DF$f != 2 & DF$lf != 2] <- 0; DF$add[DF$f != 2 & DF$lf == 2 & DF$dt==0] <- 0; DF$num <- cumsum(DF$add); return(DF); }
And using the most useful data.table :
f3 <- function(DT){ N <- nrow(DT); DT[,add:=1]; DT[,ds:=c(NA,diff(s))]; DT[,lf:=c(NA,f[1:(N-1)])]; DT[,dt:=c(NA,diff(dateTime))]; DT[f == 2 & ds == 0, add:=0]; DT[f == 2 & ds != 0, add:=1]; DT[f != 2 & lf != 2, add:=0]; DT[f != 2 & lf == 2 & dt == 0, add:=0]; DT[,num:=cumsum(add)]; return(DT); }
In a 10K data frame:
library(rbenchmark); library(data.table); N <- 1e4; DF <- buildDF(N) DT <- as.data.table(DF);
Ok, now on more decent 5M lines of data.frame
N <- 5e6; DF <- buildDF(N) DT <- as.data.table(DF); benchmark(f2(DF),f3(DT),columns=c("test", "replications", "elapsed", + "relative", "user.self", "sys.self"), order="relative",replications=1); test replications elapsed relative user.self sys.self 2 f3(DT) 1 2.843 1.000 2.092 0.624 1 f2(DF) 1 10.920 3.841 4.016 5.137
We get 5X with data.table.
I wonder if Rcpp or zoo: rollapply can win a lot. I would be pleased with any suggestion