This approach, which appears to be ~ 40X faster than the OP, takes advantage of lookup tables and takes advantage of extremely fast data table joins. In addition, he exploits the fact that, although there may be 1β6 combinations of date and time, there can be no more than 86400 unique times and possibly even fewer dates. Finally, he avoids using paste(...) at all.
library(data.table) library(stringr) # create a dataset with 1MM rows set.seed(1) x <- 1000*sample(1:1e5,1e6,replace=T) dt <- data.table(id=1:1e6, V1=format(as.POSIXct(x,origin="2011-01-01"),"%d/%m/%Y"), V2=format(as.POSIXct(x,origin="2011-01-01"),"%H:%M:%S"), V3=x) DT <- dt index.date <- function(dt) { # Edit: this change processes only times from the dataset; slightly more efficient V2 <- unique(dt$V2) dt.time <- data.table(char.time=V2, int.time=as.integer(substr(V2,7,8))+ 60*(as.integer(substr(V2,4,5))+ 60*as.integer(substr(V2,1,2)))) setkey(dt.time,char.time) # all dates from dataset dt.date <- data.table(char.date=unique(dt$V1), int.date=as.integer(as.POSIXct(unique(dt$V1),format="%d/%m/%Y"))) setkey(dt.date,char.date) # join the dates setkey(dt,V1) dt <- dt[dt.date] # join the times setkey(dt,V2) dt <- dt[dt.time, nomatch=0] # numerical index dt[,int.index:=int.date+int.time] # POSIX date index dt[,index:=as.POSIXct(int.index,origin='1970-01-01')] # get back original order setkey(dt,id) return(dt) } # new approach system.time(dt<-index.date(dt)) # user system elapsed # 2.26 0.00 2.26 # original approach DT <- dt system.time(DT[,`:=`(index= as.POSIXct(paste(V1,V2), format='%d/%m/%Y %H:%M:%S'), V1=NULL,V2=NULL)]) # user system elapsed # 84.33 0.06 84.52
Please note that performance depends on the number of unique dates. In the test case, there were 1200 unique dates.
EDIT the sentence to write the function in the more syntax data.table-sugar and avoid the "$" for the subset:
index.date <- function(dt,fmt="%d/%m/%Y") { dt.time <- data.table(char.time = dt[,unique(V2)],key='char.time') dt.time[,int.time :=as.integer(substr(char.time,7,8))+ 60*(as.integer(substr(char.time,4,5))+ 60*as.integer(substr(char.time,1,2)))] # all dates from dataset dt.date <- data.table(char.date = dt[,unique(V1)],key='char.date') dt.date[,int.date:=as.integer(as.POSIXct(char.date,format=fmt))] # join the dates setkey(dt,V1) dt <- dt[dt.date] # join the times setkey(dt,V2) dt <- dt[dt.time, nomatch=0] # numerical index dt[,int.index:=int.date+int.time] # POSIX date index dt[,index:=as.POSIXct.numeric(int.index,origin='1970-01-01')] # remove extra/temporary variables dt[,`:=`(int.index=NULL,int.date=NULL,int.time=NULL)] }