Faster function to reduce R raster resolution

I use a raster package to reduce the resolution of large rasters using a function aggregate like this

require(raster) x <- matrix(rpois(1000000, 2),1000) a <-raster(x) plot(a) agg.fun <- function(x,...) if(sum(x)==0){ return(NA) } else { which.max(table(x)) } a1<-aggregate(a,fact=10,fun=agg.fun) plot(a1) 

The bitmaps I need to collect are much larger than 34000x34000, so I would like to know if there is a faster way to implement the agg.fun function.

+5
source share
4 answers

You can use gdalUtils::gdalwarp for this. It is less effective for me than @JosephWood fasterAgg.Fun for rasters with 1,000,000 cells, but for Joseph, a more detailed example is much faster. This requires that the raster exist on disk, so the write time of the coefficient is lower if your raster file is in memory.

Below I used the modification fasterAgg.Fun , which returns the most frequent value, and not its index in the block.

 library(raster) x <- matrix(rpois(10^8, 2), 10000) a <- raster(x) fasterAgg.Fun <- function(x,...) { myRle.Alt <- function (x1) { n1 <- length(x1) y1 <- x1[-1L] != x1[-n1] i <- c(which(y1), n1) x1[i][which.max(diff(c(0L, i)))] } if (sum(x)==0) { return(NA) } else { myRle.Alt(sort(x, method="quick")) } } system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun)) ## user system elapsed ## 67.42 8.82 76.38 library(gdalUtils) writeRaster(a, f <- tempfile(fileext='.tif'), datatype='INT1U') system.time(a3 <- gdalwarp(f, f2 <- tempfile(fileext='.tif'), r='mode', multi=TRUE, tr=res(a)*10, output_Raster=TRUE)) ## user system elapsed ## 0.00 0.00 2.93 

Note that there is a slight difference in the definition of the mode when there are links: gdalwarp selects the highest value, and the functions passed to the aggregate above (through the behavior of which.max ) select the lowest (for example, see which.max(table(c(1, 1, 2, 2, 3, 4))) ).

It is also important to store raster data as integers (if applicable). If the data is stored as a float ( writeRaster by default), for example, the gdalwarp operation above takes about 14 seconds on my system. See ?dataType for available types.

+3
source

Try the following:

 fasterAgg.Fun <- function(x,...) { myRle.Alt <- function (x1) { n1 <- length(x1) y1 <- x1[-1L] != x1[-n1] i <- c(which(y1), n1) which.max(diff(c(0L, i))) } if (sum(x)==0) { return(NA) } else { myRle.Alt(sort(x, method="quick")) } } library(rbenchmark) benchmark(FasterAgg=aggregate(a, fact=10, fun=fasterAgg.Fun), AggFun=aggregate(a, fact=10, fun=agg.fun), replications=10, columns = c("test", "replications", "elapsed", "relative"), order = "relative") test replications elapsed relative 1 FasterAgg 10 12.896 1.000 2 AggFun 10 30.454 2.362 

For a larger test object, we have:

 x <- matrix(rpois(10^8,2),10000) a <- raster(x) system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun)) user system elapsed 111.271 22.225 133.943 system.time(a1 <- aggregate(a, fact=10, fun=agg.fun)) user system elapsed 282.170 24.327 308.112 

If you want the actual values ​​as indicated in @digEmAll in the comments above, just change the return value in myRle.Alt from which.max(diff(c(0L, i))) to x1[i][which.max(diff(c(0L, i)))] .

+3
source

Just for fun, I also created the Rcpp function (not much faster than @JosephWood):

 ########### original function #(modified to return most frequent value instead of index) agg.fun <- function(x,...){ if(sum(x)==0){ return(NA) } else { as.integer(names(which.max(table(x)))) } } ########### @JosephWood function fasterAgg.Fun <- function(x,...) { myRle.Alt <- function (x1) { n1 <- length(x1) y1 <- x1[-1L] != x1[-n1] i <- c(which(y1), n1) x1[i][which.max(diff(c(0L, i)))] } if (sum(x)==0) { return(NA) } else { myRle.Alt(sort(x, method="quick")) } } ########### Rcpp function library(Rcpp) library(inline) aggrRcpp <- cxxfunction(signature(values='integer'), ' Rcpp::IntegerVector v(clone(values)); std::sort(v.begin(),v.end()); int n = v.size(); double sum = 0; int currentValue = 0, currentCount = 0, maxValue = 0, maxCount = 0; for(int i=0; i < n; i++) { int value = v[i]; sum += value; if(i==0 || currentValue != value){ if(currentCount > maxCount){ maxCount = currentCount; maxValue = currentValue; } currentValue = value; currentCount = 0; }else{ currentCount++; } } if(sum == 0){ return Rcpp::IntegerVector::create(NA_INTEGER); } if(currentCount > maxCount){ maxCount = currentCount; maxValue = currentValue; } return wrap( maxValue ) ; ', plugin="Rcpp", verbose=FALSE, includes='') # wrap it to support "..." argument aggrRcppW <- function(x,...)aggrRcpp(x); 

Benchmark:

 require(raster) set.seed(123) x <- matrix(rpois(10^8, 2), 10000) a <- raster(x) system.time(a1<-aggregate(a,fact=100,fun=agg.fun)) # user system elapsed # 35.13 0.44 35.87 system.time(a2<-aggregate(a,fact=100,fun=fasterAgg.Fun)) # user system elapsed # 8.20 0.34 8.59 system.time(a3<-aggregate(a,fact=100,fun=aggrRcppW)) # user system elapsed # 5.77 0.39 6.22 ########### all equal ? all(TRUE,all.equal(a1,a2),all.equal(a2,a3)) # > [1] TRUE 
+3
source

If your goal is aggregation, don't you need the max function?

 library(raster) x <- matrix(rpois(1000000, 2),1000) a <- aggregate(a,fact=10,fun=max) 
0
source

All Articles