Fast subset in R

I have a dataframe of size 30,000 x 50. I also have a separate list that contains points for grouping the rows from this data frame, for example,

rows <- list(c("34", "36", "39"), c("45", "46"))

This means that the rows of the data frame whose rownames (the symbolic, character row names of `dat`, not the numeric row indices) are "34", "36", "39" make up one group, and "45", "46" make up another group.

Now I want to pull those groups out of the data frame into a parallel list, but my code (below) is really very slow. How can I speed it up?

> system.time(lapply(rows, function(r) {dat[r, ]}))
   user  system elapsed 
 246.09    0.01  247.23 

This is on a very fast computer, R 2.14.1 x64.

+5
source share
5 answers

`[.data.frame` is slow with character indices because it has to resolve the row names with `match`. You can do that lookup yourself, and replace `match` with `fmatch` from the fastmatch package, which caches the hash table between calls. Compare:

# naive
> system.time(res1 <- lapply(rows,function(r) dat[r,]))
   user  system elapsed 
 69.207   5.545  74.787 

# match
> rn <- rownames(dat)
> system.time(res1 <- lapply(rows,function(r) dat[match(r,rn),]))
   user  system elapsed 
 36.810  10.003  47.082 

# fastmatch
> rn <- rownames(dat)
> system.time(res1 <- lapply(rows,function(r) dat[fmatch(r,rn),]))
   user  system elapsed 
 19.145   3.012  22.226 

Note that `[` on a data.frame is inherently slow (it is plain R code), and if your groups never overlapped you could consider `split` instead; but your `rows` list suggests the groups are arbitrary (and possibly overlapping), so this approach keeps your exact semantics.

Even so, swapping in `fmatch` alone gives a substantial speedup.

+17

Update

A note on why name-based indexing is slow:

Indexing by rownames/colnames requires a character lookup for every name, which is slow: the row named "36" is not necessarily the 36th row of the data frame, and it need not even be stored anywhere near the row named "34".

By contrast, indexing by integer position lets R jump straight to the rows you want — no name matching is needed at all.

So, where possible, index by row number rather than by row name.

That said, there may be better tools for what you are really trying to do.

"" data.frame, split data.frame , . , data.frame :

dat <- data.frame(a=sample(100, 10),
                  b=rnorm(10),
                  group=sample(c('a', 'b', 'c'), 10, replace=TRUE))

Then splitting on the `group` column gives:

split(dat, dat$group)
$a
   a           b group
2 66 -0.08721261     a
9 62 -1.34114792     a

$b
    a          b group
1  32  0.9719442     b
5  79 -1.0204179     b
6  83 -1.7645829     b
7  73  0.4261097     b
10 44 -0.1160913     b

$c
   a          b group
3 77  0.2313654     c
4 74 -0.8637770     c
8 29  1.0046095     c

If you care about speed, convert the data.frame to a data.table keyed on `group`:

library(data.table)
dat <- data.table(dat, key="group")

Then you can get the equivalent of `split` with:

 x <- lapply(unique(dat$group), function(g) dat[J(g),])

But if your real goal is to compute something over each group "inline", data.table's `by` is even more convenient:

ans <- dat[, {
  ## do some code over the data in each split
  ## and return a list of results, eg:
  list(nrow=length(a), mean.a=mean(a), mean.b=mean(b))
}, by="group"]

ans
     group nrow mean.a     mean.b
[1,]     a    2   64.0 -0.7141803
[2,]     b    5   62.2 -0.3006076
[3,]     c    3   60.0  0.1240660

The "data.frame way" of doing the same thing is with plyr, for example:

library(plyr)
ddply(dat, "group", summarize, nrow=length(a), mean.a=mean(a),
      mean.b=mean(b))
  group nrow mean.a     mean.b
1     a    2   64.0 -0.7141803
2     b    5   62.2 -0.3006076
3     c    3   60.0  0.1240660

But if speed is a concern, stick with data.table.

+5

The dominant cost in your code is the lookup from a rowname to the corresponding row number in `dat`.

First, set up data to reproduce your example:

> dat <- data.frame(matrix(runif(30000*50),ncol=50))
> rownames(dat) <- as.character(sample.int(nrow(dat)))
> rownames(dat)[1:5]
[1] "21889" "3050"  "22570" "28140" "9576" 

`rows` contains 15000 groups, each made of up to 50 rownames drawn from "1" to "30000":

# 15000 groups of up to 50 rows each
> rows <- sapply(1:15000, function(i) as.character(sample.int(30000,size=sample.int(50,size=1))))

Timing the naive method (ouch!):

# method 1
> system.time((res1 <- lapply(rows,function(r) dat[r,])))
   user  system elapsed 
182.306   0.877 188.362 

The fix is to precompute a lookup table `map` such that `map[i]` gives the row number of the row named `i`.

FIRST, if your rownames happen to be a permutation of 1:nrow(dat), you are in luck! All you need is to sort them, keeping the permutation index:

> map <- sort(as.numeric(rownames(dat)), index.return=T)$ix
# NOTE: map[ as.numeric(rowname) ] -> rownumber into dat for that rowname.

Then the subsetting becomes:

> system.time((res2 <- lapply(rows,function(r) dat[map[as.numeric(r)],])))
   user  system elapsed
 32.424   0.060  33.050

Check that the results agree (always a good sanity check when optimizing R code):

> all(rownames(res1)==rownames(res2))
[1] TRUE

That's a ~6x speedup. Not quite as fast as the fastmatch approach, but it needs no extra packages...

SECOND, if the rownames are not a permutation of 1:nrow(dat) — say max(as.numeric(rownames(dat))) is larger than nrow(dat) — then `map` must be long enough to be indexed by the rowname value itself, i.e. map[rowname]. Fill it with -1 for rownames that do not exist, and set the rest from the sort:

map <- rep(-1,max(as.numeric(rownames(dat))))
obj <- sort(as.numeric(rownames(dat)), index.return=T)
map[obj$x] <- obj$ix

Then use `map` as before (i.e. `dat[map[as.numeric(r)], ]`).

+4
source

You can try this modification (note that `%in%` returns the rows in the order they appear in `dat` and drops duplicate selections, which may differ from the original):

system.time(lapply(rows, function(r) {dat[ rownames(dat) %in% r, ]}))
+2
source

I agree with mathematical.coffee — I also get fast times with this approach.

I don't know if this is always possible in your situation, but unlisting `rows` into a single vector and converting it to numeric can give a speedup:

dat <- data.frame(matrix(rnorm(30000*50), 30000, 50 ))
rows <- as.numeric(unlist(list(c("34", "36", "39"), c("45", "46"))))
system.time(lapply(rows, function(r) {dat[r, ]}))

EDIT:

dat$observ <- rownames(dat)
rownames(dat) <- 1:nrow(dat)
+1
source

All Articles