R: How can I avoid the for loop in this situation?

I am trying to perform a simple intersection of genomic tracks in R and run into serious performance problems, probably related to my use of loops.

In this situation, I have predefined windows with an interval of 100 bp, and I'm trying to calculate how many of each window are covered with annotations in the list. Graphically, it looks something like this:

          0    100   200    300    400   500   600  
windows: |-----|-----|-----|-----|-----|-----|

mylist:    |-|   |-----------|

So, I wrote the code for this, but it is rather slow and has become a bottleneck in my code:

##window for each 100-bp segment    
windows <- numeric(6)

##second track
mylist = vector("list")
mylist[[1]] = c(1,20)
mylist[[2]] = c(120,320)


##do the intersection
for(i in 1:length(mylist)){
  st <- floor(mylist[[i]][1]/100)+1
  sp <- floor(mylist[[i]][2]/100)+1
  for(j in st:sp){       
    b <- max((j-1)*100, mylist[[i]][1])
    e <- min(j*100, mylist[[i]][2])
    windows[j] <- windows[j] + e - b + 1
  }
}

print(windows)
[1]  20  81 101  21   0   0

, , , . , for, * apply , .

, - C, , , . - , ?

+5
5

"" - bioconductor IRanges, IntervalTree .

IRanges , findOverlaps .

:

http://www.bioconductor.org/packages/release/bioc/html/IRanges.html

C, .

-, , ( ), , ( )... , , . , .

, .

+6

, , 100 20, . :

Reduce('+', lapply(mylist, function(x) hist(x[1]:x[2], breaks = (0:6) * 100, plot = F)$counts)) 

, breaks, , , .

+4

, 3- . - ?

:

my <- do.call(rbind,mylist)
myFloor <- floor(my/100)
myRem <- my%%100
#Add intervals, over counting interval endpoints
counts <- table(do.call(c,apply(myFloor,1,function(r) r[1]:r[2])))
windows[as.numeric(names(counts))+1] <- counts*101

#subtract off lower and upper endpoints
lowerUncovered <- tapply(myRem[,1],myFloor[,1],sum)
windows[as.numeric(names(lowerUncovered))+1]  <-  windows[as.numeric(names(lowerUncovered))+1]  - lowerUncovered
upperUncovered <- tapply(myRem[,2],myFloor[,2],function(x) 100*length(x) - sum(x))
windows[as.numeric(names(upperUncovered))+1]  <-  windows[as.numeric(names(upperUncovered))+1] - upperUncovered

:

mylist = vector("list")
for(i in 1:20000){
    d <- round(runif(1,,500))
    mylist[[i]] <- c(d,d+round(runif(1,,700)))
}

windows <- numeric(200)


new_code <-function(){
    my <- do.call(rbind,mylist)
    myFloor <- floor(my/100)
    myRem <- my%%100
    counts <- table(do.call(c,apply(myFloor,1,function(r) r[1]:r[2])))
    windows[as.numeric(names(counts))+1] <- counts*101

    lowerUncovered <- tapply(myRem[,1],myFloor[,1],sum)
    windows[as.numeric(names(lowerUncovered))+1]  <-  windows[as.numeric(names(lowerUncovered))+1]  - lowerUncovered

    upperUncovered <- tapply(myRem[,2],myFloor[,2],function(x) 100*length(x) - sum(x))
    windows[as.numeric(names(upperUncovered))+1]  <-  windows[as.numeric(names(upperUncovered))+1] - upperUncovered

    #print(windows)
}


#old code
old_code <- function(){
    for(i in 1:length(mylist)){
        st <- floor(mylist[[i]][1]/100)+1
        sp <- floor(mylist[[i]][2]/100)+1
        for(j in st:sp){       
            b <- max((j-1)*100, mylist[[i]][1])
            e <- min(j*100, mylist[[i]][2])
            windows[j] <- windows[j] + e - b + 1
        }
    }
    #print(windows)
}

system.time(old_code())
system.time(new_code())

:

> system.time(old_code())
   user  system elapsed 
  2.403   0.021   2.183 
> system.time(new_code())
   user  system elapsed 
  0.739   0.033   0.588 

, 0, . , C, 50-100X .

+4

, ... System.time .

windows <- numeric(6)

mylist = vector("list")
mylist[[1]] = c(1,20)
mylist[[2]] = c(120,320)


library(plyr)

l_ply(mylist, function(x) {
sapply((floor(x[1]/100)+1) : (floor(x[2]/100)+1), function(z){
    eval.parent(parse(text=paste("windows[",z,"] <- ", 
        min(z*100, x[2]) - max((z-1)*100, x[1]) + 1,sep="")),sys.nframe())
    })          
})

print(windows)

eval

g <- llply(mylist, function(x) {
ldply((floor(x[1]/100)+1) : (floor(x[2]/100)+1), function(z){
        t(matrix(c(z,min(z*100, x[2]) - max((z-1)*100, x[1]) + 1),nrow=2))
    })          
})

for(i in 1:length(g)){
    windows[unlist(g[[i]][1])] <- unlist(g[[i]][2])
}
+1

, . , mylist, 100 windows. , st -th sp - .

  windows <- numeric(100)
  for(i in 1:length(mylist)){ 
    win <- mylist[[i]]         # for cleaner code
    st <- floor(win[1]/100)+1 
    sp <- floor(win[2]/100)+1 
    # start and stop are within the same window
    if (sp == st){
      windows[st] <- windows[st] + (win[2]%%100) - (win[1]%%100) +1 
    }
    # start and stop are in separate windows - take care of edges
    if (sp > st){
      windows[st] <- windows[st] + 100 - (win[1]%%100) + 1
      windows[sp] <- windows[sp] + (win[2]%%100)
    }
    # windows completely inside win
    if (sp > st+1){
      windows[(st+1):(sp-1)] <- windows[(st+1):(sp-1)] + 100
    }       
  }

:

  cuts <- sort(sample(1:10000, 70))  # random interval endpoints
  mylist <- split(cuts, gl(35,2))

1,08 1000 1,72 1000 . , mylist 100 .

, , lapply mylist, .

0

All Articles