How to control the progress of the function?

I need to build a 2886 * 2886 correlation matrix. The problem is that it takes a long time to create the intermediate data.table (RESULT) by binding the pieces together, so I would like to be able to do the following while the last line of the code below, RESULT=rbindlist(apply(COMB, 1, append)), is running:

  • Estimate the time required to complete the apply function
  • Track its progress.
  • Possibility to pause and continue at a later time

Here is the code:

SOURCE=data.table(NAME=rep(paste0("NAME", as.character(1:2889)), each=600), VALUE=sample(c(TRUE,FALSE), 600, TRUE) )
> SOURCE
            NAME VALUE
      1:   NAME1  TRUE
      2:   NAME1  TRUE
      3:   NAME1  TRUE
      4:   NAME1  TRUE
      5:   NAME1  TRUE
     ---              
1733396: NAME999  TRUE
1733397: NAME999  TRUE
1733398: NAME999  TRUE
1733399: NAME999  TRUE
1733400: NAME999 FALSE

setkey(SOURCE,NAME)
a=SOURCE[,unique(NAME)]
COMB=data.table(expand.grid(a,a, stringsAsFactors=FALSE))
> COMB
             Var1    Var2
      1:    NAME1   NAME1
      2:   NAME10   NAME1
      3:  NAME100   NAME1
      4: NAME1000   NAME1
      5: NAME1001   NAME1
     ---                 
8346317:  NAME995 NAME999
8346318:  NAME996 NAME999
8346319:  NAME997 NAME999
8346320:  NAME998 NAME999
8346321:  NAME999 NAME999

# Build the long-format rows for one pair of names.
#
# X: character vector whose first two elements are the names to look up
#    (one row of COMB when called through apply()).
# Reads the global data.table SOURCE (keyed by NAME) and returns a data.table
# with the columns NAME1, VALUE1, NAME2 and VALUE2.
# Note: this definition masks base::append().
append <- function(X) {
  values_first <- SOURCE[X[1], VALUE]
  values_second <- SOURCE[X[2], VALUE]
  data.table(
    NAME1 = X[1], VALUE1 = values_first,
    NAME2 = X[2], VALUE2 = values_second
  )
}

RESULT=rbindlist(apply(COMB, 1, append))

Any idea?

Also, do you know if there is a faster way to generate the data.table RESULT from SOURCE? RESULT is an intermediate data.table used for calculating correlation values between VALUE1 and VALUE2 for each pair of NAMEs.

With a subset of SOURCE, RESULT looks like this:

SOURCE=SOURCE[sample(1:nrow(SOURCE), 3)]
setkey(SOURCE,NAME)
a=SOURCE[,unique(NAME)]
COMB=data.table(expand.grid(a,a, stringsAsFactors=FALSE))
RESULT=rbindlist(apply(COMB, 1, append))
> RESULT
      NAME1 VALUE1    NAME2 VALUE2
1: NAME1859   TRUE NAME1859   TRUE
2:  NAME768  FALSE NAME1859   TRUE
3:  NAME795   TRUE NAME1859   TRUE
4: NAME1859   TRUE  NAME768  FALSE
5:  NAME768  FALSE  NAME768  FALSE
6:  NAME795   TRUE  NAME768  FALSE
7: NAME1859   TRUE  NAME795   TRUE
8:  NAME768  FALSE  NAME795   TRUE
9:  NAME795   TRUE  NAME795   TRUE

Afterwards I compute RESULT[,VALUE3:=(VALUE1==VALUE2)], and finally: RESULT[, mean(VALUE3), by=c("NAME1", "NAME2")].

+12
6

You can use the pbapply package (git), which adds a progress bar to the '*apply' family of functions.

:

library(pbapply)      

# Same call as apply(COMB, 1, append), with pbapply() substituted for apply().
result <- rbindlist( pbapply(COMB, 1, append) )

. . , , . , , .

+17

Use txtProgressBar from the utils package:

# Minimal txtProgressBar demo: 50 dummy steps of 0.1 s each.
total <- 50
pb <- txtProgressBar(min = 0, max = total, style = 3)

# Each step is run only for its side effects, so a for loop is used;
# a top-level lapply() would return (and auto-print) a 50-element list
# right over the progress bar.
for (i in seq_len(total)) {
  Sys.sleep(0.1)            # stand-in for real work
  setTxtProgressBar(pb, i)  # move the bar to step i
}

# Close the bar once the work is finished.
close(pb)

For the *ply family of functions, use the plyr package:

library(plyr)
# .progress = "text" asks laply() for a text progress bar while it iterates;
# the Sys.sleep() call only simulates work.
laply(1:100, function(i) {Sys.sleep(0.05); i}, .progress = "text")

?create_progress_bar()

+6

:

# Key SOURCE by NAME so that character subsets such as SOURCE[V1, VALUE]
# are resolved as keyed joins.
setkey(SOURCE, NAME)

# Cross join the unique names with themselves, then for every (V1, V2) pair
# compute the share of positions at which the two VALUE vectors agree.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
SOURCE[, CJ(NAME, NAME, unique = TRUE)][
       , mean(SOURCE[V1, VALUE] == SOURCE[V2, VALUE]), by = .(V1, V2)]

Fwiw, all-caps - imo - .

+1

? . :

# dummy data: 20 rows drawn from four names with random logical values
set.seed(1)
SOURCE <- data.frame(
  NAME = sample(paste0("Name", 1:4), 20, replace = TRUE),
  VALUE = sample(c(TRUE, FALSE), 20, replace = TRUE)
)

# two copies of the data whose column names do not overlap
d1 <- setNames(SOURCE, c("NAME1", "VALUE1"))
d2 <- setNames(SOURCE, c("NAME2", "VALUE2"))

# cross join: with no shared column names, merge() pairs every row of d1
# with every row of d2
merge(d1, d2, all = TRUE)
0

. txtProgressBar(), @JavK ! - .


- , . terminfo . , , , tput:

# Ask tput for the terminfo "cub1" capability and capture what it prints.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
tc_left <- system2("tput", "cub1", stdout = TRUE)

reset . , Unix, ​​ terminfo; , RStudio Windows.

, txtProgressBar() ( @JavK), , reset : ! , cat('\r');, .


. , progInit(), , ( , , ), , prog(), . , prog.

# Reset the global progress-tracking state before a run.
#
# N:   total number of steps; prog() uses it as the denominator of the
#      percentage and of the time estimate.
# dec: number of decimal places shown in the percentage (default 3L).
#
# Writes the variables progStart, progN, progI and progDec with `<<-`, so
# they end up outside the function (in the global environment when
# progInit() is defined at top level). Invisibly returns `dec`.
progInit <- function(N, dec = 3L) {
  progStart <<- Sys.time()
  progN <<- N
  progI <<- 1L
  progDec <<- dec
}

# Print a one-line progress report and advance the global step counter.
#
# Reads the variables progStart, progI, progN and progDec (as written by
# progInit()), prints the percentage done followed by the estimated time
# remaining as [Dd/]HH:MM:SS.mmm, and increments progI by one.
# The message is wrapped in carriage returns so that successive calls
# overwrite the same console line.
prog <- function() {
  # remaining time, extrapolated from the time elapsed so far
  elapsed <- unclass(difftime(Sys.time(), progStart, units = "secs"))
  left <- elapsed * (progN / progI - 1)

  # peel off days, hours, minutes and seconds; the rest becomes milliseconds
  days <- as.integer(left / 86400)
  left <- left - days * 86400
  hours <- as.integer(left / 3600)
  left <- left - hours * 3600
  minutes <- as.integer(left / 60)
  left <- left - minutes * 60
  seconds <- as.integer(left)
  millis <- as.integer((left - seconds) * 1000)

  # the day prefix is only shown when at least one full day remains
  clock <- sprintf("%02d:%02d:%02d.%03d", hours, minutes, seconds, millis)
  if (days != 0L) {
    clock <- sprintf("%dd/%s", days, clock)
  }

  # 20 trailing blanks, presumably to wipe leftovers of a longer
  # previous message
  msg <- sprintf(
    " %.*f%% %s%s",
    progDec, progI / progN * 100, clock, strrep(" ", 20L)
  )
  cat("\r", msg, "\r", sep = "")

  progI <<- progI + 1L
}

library(data.table)

# Each of the 2889 names is repeated 600 times; VALUE is built from a random
# 600-element logical sample.
SOURCE <- data.table(
  NAME = rep(paste0("NAME", as.character(1:2889)), each = 600),
  VALUE = sample(c(TRUE, FALSE), 600, TRUE)
)
setkey(SOURCE, NAME)

# every ordered pair of distinct names
a <- SOURCE[, unique(NAME)]
COMB <- data.table(expand.grid(a, a, stringsAsFactors = FALSE))

# Row builder for one pair of names (X is one row of COMB); reports progress
# through prog() on every call before doing the keyed lookups in SOURCE.
append <- function(X) {
  prog()
  values_first <- SOURCE[X[1], VALUE]
  values_second <- SOURCE[X[2], VALUE]
  data.table(
    NAME1 = X[1], VALUE1 = values_first,
    NAME2 = X[2], VALUE2 = values_second
  )
}

## x <- COMB; progInit(nrow(x)); rbindlist(apply(x, 1, append)) ## full object
x <- COMB[1:1e4, ] ## ~30s
progInit(nrow(x))
rbindlist(apply(x, 1, append))

: , ( /), .

, COMB , ; , . , -, , , , . , , , , , . , , , , , , (.. ) .

0

For fancier progress indicators (not part of base R / the standard library) there is also the progress package:

# Create a bar with 100 total ticks; the format string shows the bar itself,
# the percentage done and an ETA.
# NOTE(review): progress_bar presumably comes from the 'progress' package
# named in the sentence above, which has to be attached first — confirm.
pb <- progress_bar$new(
  format = "  downloading [:bar] :percent eta: :eta",
  total = 100, clear = FALSE, width= 60)
for (i in 1:100) {
  pb$tick()  # register one completed step
  Sys.sleep(1 / 100)  # stand-in for real work
}

#> downloading [========----------------------]  28% eta:  1s

Thus, this meets the requirements of (1) and (2), but not (3). To cache intermediate results, it is probably easiest to write data to disk from time to time. For quick serialization, you can try

  • fst: convenient for serializing columnar data structures such as data.tables
  • qs for more general object serialization

Hope this helps.

0
source

All Articles