Data frame subset performance

I have several large data frames (1 million+ rows x 6-10 columns), and I need to take a subset of them repeatedly. Subsetting is the slowest part of my code, and I'm curious whether there is a way to do it faster.

load("https://dl.dropbox.com/u/4131944/Temp/DF_IOSTAT_ALL.rda") start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M") end_in<- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M") system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,]) > system.time(DF_IOSTAT_INT <- DF_IOSTAT_ALL[DF_IOSTAT_ALL$date_stamp >= start_in & DF_IOSTAT_ALL$date_stamp <= end_in,]) user system elapsed 16.59 0.00 16.60 dput(head(DF_IOSTAT_ALL)) structure(list(date_stamp = structure(list(sec = c(14, 24, 34, 44, 54, 4), min = c(0L, 0L, 0L, 0L, 0L, 1L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(20L, 20L, 20L, 20L, 20L, 20L), mon = c(7L, 7L, 7L, 7L, 7L, 7L), year = c(112L, 112L, 112L, 112L, 112L, 112L ), wday = c(1L, 1L, 1L, 1L, 1L, 1L), yday = c(232L, 232L, 232L, 232L, 232L, 232L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst" ), class = c("POSIXlt", "POSIXt")), cpu = c(0.9, 0.2, 0.2, 0.1, 0.2, 0.1), rsec_s = c(0, 0, 0, 0, 0, 0), wsec_s = c(0, 3.8, 0, 0.4, 0.2, 0.2), util_pct = c(0, 0.1, 0, 0, 0, 0), node = c("bda101", "bda101", "bda101", "bda101", "bda101", "bda101")), .Names = c("date_stamp", "cpu", "rsec_s", "wsec_s", "util_pct", "node"), row.names = c(NA, 6L), class = "data.frame") 
2 answers

I would use xts for this. The only potential hiccup is that xts stores its data as a matrix with an ordered index attribute, so you cannot mix column types as you can in a data.frame.

If the node column is invariant, you can simply exclude it from the xts object:

    library(xts)
    x <- xts(DF_IOSTAT_ALL[, 2:5], as.POSIXct(DF_IOSTAT_ALL$date_stamp))
    x["2012-08-20 00:00:24/2012-08-20 00:00:54"]
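To see why the character column has to be excluded (or converted, as in the update below), here is a minimal sketch with a hypothetical toy data frame; none of these object names come from the original post:

    library(xts)
    # Toy data: one numeric column, one character column.
    df <- data.frame(when = as.POSIXct("2012-08-20 00:00:14") + c(0, 10, 20),
                     cpu  = c(0.9, 0.2, 0.2),
                     node = c("bda101", "bda101", "bda101"),
                     stringsAsFactors = FALSE)
    # xts coerces its coredata to a single matrix, so mixing types
    # silently turns the numeric cpu values into strings:
    x_bad <- xts(df[, c("cpu", "node")], df$when)
    storage.mode(coredata(x_bad))  # "character"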

Update, using the OP's actual data:

    Data <- DF_IOSTAT_ALL
    # Change node from character to numeric,
    # so it can exist in the xts object too.
    Data$node <- as.numeric(gsub("^bda", "", Data$node))
    # Create the xts object.
    x <- xts(Data[, -1], as.POSIXct(Data$date_stamp))
    # Subset 13:00-17:00 on one day.
    system.time(x['2012-08-20 13:00/2012-08-20 17:00'])
    #    user  system elapsed
    #       0       0       0
    # Subset 13:00-17:00 for all days.
    system.time(x['T13:00/T17:00'])
    #    user  system elapsed
    #    2.64    0.00    2.66
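A note on the bracket syntax: xts subsetting takes ISO-8601-style range strings, where "start/end" selects a closed interval, and the date-free "T13:00/T17:00" form selects that time-of-day window on every day in the object. That is what makes the repeated daily subset a one-liner here.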

Here are my experiments with data.table. Interestingly, just converting to a data.table makes the lookup faster, perhaps because of more efficient logical-vector subsetting. I compared four things: the original data.frame lookup; the lookup after converting from POSIXlt to POSIXct (thanks to Matthew Dowle); the data.table lookup; and the data.table lookup including the setup steps of copying and converting. Even with that setup overhead included, the data.table lookup wins, and with repeated lookups the time savings grow further. (POSIXlt stores each timestamp as a list of nine components, while POSIXct is a single numeric vector, which is why the conversion alone accounts for so much of the speedup.)

    library(data.table)
    library(rbenchmark)

    load("DF_IOSTAT_ALL.rda")
    DF_IOSTAT_ALL.original <- DF_IOSTAT_ALL

    start_in <- strptime("2012-08-20 13:00", "%Y-%m-%d %H:%M")
    end_in   <- strptime("2012-08-20 17:00", "%Y-%m-%d %H:%M")

    # function to test: original
    fun <- function() DF_IOSTAT_INT <<-
        DF_IOSTAT_ALL.original[DF_IOSTAT_ALL.original$date_stamp >= start_in &
                               DF_IOSTAT_ALL.original$date_stamp <= end_in, ]

    # function to test: changing to POSIXct
    DF_IOSTAT_ALL.ct <- within(DF_IOSTAT_ALL.original,
                               date_stamp <- as.POSIXct(date_stamp))
    fun.ct <- function() DF_IOSTAT_INT <<-
        DF_IOSTAT_ALL.ct[with(DF_IOSTAT_ALL.ct,
                              date_stamp >= start_in & date_stamp <= end_in), ]

    # function to test: with data.table and POSIXct
    DF_IOSTAT_ALL.dt <- as.data.table(DF_IOSTAT_ALL.ct)
    fun.dt <- function() DF_IOSTAT_INT <<-
        DF_IOSTAT_ALL.dt[date_stamp >= start_in & date_stamp <= end_in, ]

    # function to test: with data.table and POSIXct, including setup steps
    newfun <- function() {
        DF_IOSTAT_ALL <- DF_IOSTAT_ALL.original
        # data.table doesn't play well with POSIXlt, so convert to POSIXct
        DF_IOSTAT_ALL$date_stamp <- as.POSIXct(DF_IOSTAT_ALL$date_stamp)
        DF_IOSTAT_ALL <- data.table(DF_IOSTAT_ALL)
        DF_IOSTAT_INT <<- DF_IOSTAT_ALL[date_stamp >= start_in & date_stamp <= end_in, ]
    }

    benchmark(fun(), fun.ct(), fun.dt(), newfun(),
              replications = 3, order = "relative")
    #       test replications elapsed   relative user.self sys.self user.child sys.child
    # 3 fun.dt()            3    0.18   1.000000      0.11     0.08         NA        NA
    # 2 fun.ct()            3    0.52   2.888889      0.44     0.08         NA        NA
    # 4 newfun()            3   35.49 197.166667     34.88     0.58         NA        NA
    # 1    fun()            3   66.68 370.444444     66.42     0.15         NA        NA
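As a hedged aside (not part of the original answer): once the timestamps are POSIXct, data.table's %between% operator expresses the same closed-interval filter a little more compactly. DF_IOSTAT_ALL.key is an illustrative name; the sketch assumes DF_IOSTAT_ALL.dt from above:

    library(data.table)
    DF_IOSTAT_ALL.key <- copy(DF_IOSTAT_ALL.dt)
    setkey(DF_IOSTAT_ALL.key, date_stamp)  # sort once by timestamp
    # %between% is inclusive on both ends, like the >= / <= pair above.
    DF_IOSTAT_INT <- DF_IOSTAT_ALL.key[
        date_stamp %between% c(as.POSIXct(start_in), as.POSIXct(end_in))]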

If you know your time intervals ahead of time, you can make it even faster by binning the timestamps with findInterval or cut and keying/indexing the table on the resulting bin.

    # Note: DF_IOSTAT_ALL.new, strptime.d and fun2 are not defined in this excerpt;
    # they appear to be a copy of the data and helpers from earlier in the original
    # answer (a timestamp-parsing shortcut and the POSIXct-comparison subset being
    # benchmarked against).
    DF_IOSTAT_ALL <- copy(DF_IOSTAT_ALL.new)
    time.breaks <- strptime.d("2012-08-19 19:00:00") + 0:178 * 60 * 60  # hourly breaks
    DF_IOSTAT_ALL[, interval := findInterval(date_stamp, time.breaks)]
    setkey(DF_IOSTAT_ALL, interval)
    start_in <- time.breaks[60]
    end_in   <- time.breaks[61]
    benchmark(a <- DF_IOSTAT_ALL[J(60)], b <- fun2(DF_IOSTAT_ALL))
    #                   test replications elapsed relative user.self sys.self user.child sys.child
    # 1 DF_IOSTAT_ALL[J(60)]          100    0.78 1.000000      0.64     0.14         NA        NA
    # 2  fun2(DF_IOSTAT_ALL)          100    6.69 8.576923      5.76     0.91         NA        NA
    all.equal(a, b[, .SD, .SDcols = c(12, 1:11, 13)])  # equal after rearranging columns
    # TRUE
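For readers who want to run the binning idea end to end, here is a self-contained sketch under assumed data (all names hypothetical, generated data rather than the OP's .rda file):

    library(data.table)
    # Hypothetical stand-in for DF_IOSTAT_ALL: one row per second.
    origin <- as.POSIXct("2012-08-19 19:00:00")
    dt <- data.table(date_stamp = origin + 0:(1e6 - 1),
                     cpu        = runif(1e6))
    # Bin timestamps into hourly intervals and key on the bin number.
    time.breaks <- origin + 0:278 * 3600
    dt[, interval := findInterval(date_stamp, time.breaks)]
    setkey(dt, interval)  # sorts once; later lookups use binary search
    dt[J(60)]             # all rows falling in the 60th hourly bin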

Source: https://habr.com/ru/post/925672/

