How to resolve integer overflow errors when estimating a model in R

I am trying to estimate a model using speedglm in R. The dataset is large (~69.88 million rows and 38 columns). Multiplying the number of rows by the number of columns gives ~2.7 billion, which exceeds R's integer limit (.Machine$integer.max = 2147483647). I cannot provide the data, but the following examples recreate the problem.

library(speedglm)

# Large example that works: nrow * ncol = 500000 * 500 = 2.5e8 cells, which
# is below .Machine$integer.max (2147483647).
library(biglm)  # loaded as in the original example; no biglm function is called below
# n and k are deliberately doubles (no L suffix) so that n * k itself is
# computed in double precision and cannot overflow.
n <- 500000
k <- 500
y <- rgamma(n, 1.5, 1)
x <- round(matrix(rnorm(n * k), n, k), digits = 3)
colnames(x) <- paste0("s", seq_len(k))
da <- data.frame(y, x)
# Formula y ~ s1 + s2 + ... + sk, built from the column names just assigned.
fo <- as.formula(paste("y ~", paste(colnames(x), collapse = " + ")))
working.example <- speedglm(fo, data = da, family = Gamma(log))

# Repeat with a size large enough to break: nrow * ncol = 500000 * 5000 =
# 2.5e9 cells, which exceeds .Machine$integer.max.
k <- 5000       # 10 times larger than above
x <- round(matrix(rnorm(n * k), n, k), digits = 3)
colnames(x) <- paste0("s", seq_len(k))
da <- data.frame(y, x)
fo <- as.formula(paste("y ~", paste(colnames(x), collapse = " + ")))
# Expected to fail: this call is the reproduction of the reported problem.
failed.example <- speedglm(fo, data = da, family = Gamma(log))

# Attempting to resolve the error with chunksize (fails in the same way).
attempted.fixed.example <- speedglm(fo, data = da, family = Gamma(log), chunksize = 10^6)

Both failing calls produce an integer-overflow warning followed by an error:

Error in if (!replace && is.null(prob) && n > 1e+07 && size <= n/2) .Internal(sample2(n,  :  
  missing value where TRUE/FALSE needed
In addition: Warning message:
In nrow(X) * ncol(X) : NAs produced by integer overflow 

I understand the warning, but I do not understand the error. The two seem to be connected, since they appear together after every attempt.

I have not found a way around this; in particular, setting chunksize, as shown above, does not help.

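Questions: (1) What causes the error? (2) If it is caused by the integer overflow, can it be fixed? (3) If not, is there a workaround?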

Thanks,

JP.

R version: R 3.3.3 (2017-03-06)

Actual code:

# Right-hand-side terms of the default model. The last entry uses formula
# syntax (`:`) and is therefore an interaction term, not a column name.
dft_var <- c("cltvV0", "cltvV60", "cltvV120", "VCFLBRQ", "ageV0", 
             "ageV1", "ageV8", "ageV80", "FICOV300", "FICOV650", 
             "FICOV900", "SingleHouse", "Apt", "Mobile", "Duplex", 
             "Row", "Modular", "Rural", "FirstTimeBuyer", 
             "FirstTimeBuyerMissing", "brwtotinMissing", "IncomeRatio", 
             "VintageBefore2001", "NFLD", "yoy.fcpwti:province_n") 
# Convert the assembled string to a formula explicitly rather than handing
# speedglm a character vector and relying on implicit coercion.
logit1 <- speedglm(formula = as.formula(paste("DefaultFlag ~",
                                              paste(dft_var, collapse = " + "))),
                   family = binomial(logit), 
                   na.action = na.exclude, 
                   data = default.data,
                   chunksize = 1e7)
+6
2

Update:

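As a workaround, as @James pointed out, you can avoid the error by passing a non-NULL value for the sparse argument of speedglm, because that skips the is.sparse check.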

So, the following should work:

# Workaround: supply `sparse` explicitly (non-NULL) so the sparsity check
# that triggers the overflow is not run.
speedglm(fo, data = da, family = Gamma(log), sparse = FALSE)

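Explanation:

The error comes from the is.sparse function in the speedglm package.

That function contains the following call: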

# Excerpt (apparently from speedglm's is.sparse): the sample size is derived
# from nrow(X) * ncol(X), the product named in the overflow warning.
sample(X,round((nrow(X)*ncol(X)*camp),digits=0),replace=FALSE)

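The problem is the product nrow(X)*ncol(X). Both nrow and ncol return integer values, so the multiplication is done in integer arithmetic. When the result exceeds the integer limit, it overflows and becomes NA: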

# Two integer values (L suffix) whose product, 1e12, exceeds
# .Machine$integer.max, so integer multiplication overflows to NA.
nr <- 1000000L
nc <- 1000000L
nr * nc
# [1] NA
# Warning message:
# In nr * nc : NAs produced by integer overflow

# Converting one operand to double avoids the overflow.
as.numeric(nr) * nc
# [1] 1e+12

As a result, sample fails, because for a sufficiently large X it is called with size = NA. For example:

# Reproduces the failure directly by passing size = NA to sample().
# Note: matrix(1, 3000, 1000000) holds 3e9 doubles (roughly 24 GB), so this
# line is expensive to run as written.
sample(matrix(1,3000,1000000), NA, replace=FALSE)
# Error in if (useHash) .Internal(sample2(n, size)) else .Internal(sample(n,  : 
# missing value where TRUE/FALSE needed
+2

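@Andrey is right: the problem is in is.sparse. However, you can avoid it by passing sparse=FALSE to speedglm (or sparse=TRUE, if your data really are sparse). This works because speedglm only calls is.sparse, inside speedglm.wfit, under this condition: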

# is.sparse() is evaluated only when `sparse` is NULL; any non-NULL value
# supplied by the caller is used as-is and the check is skipped.
if (is.null(sparse))
    sparse <- is.sparse(x = x, sparsellim, camp)

So, if sparse is supplied explicitly, is.sparse is never called.

So, the following should work:

# Workaround: supply `sparse` explicitly (non-NULL) so the sparsity check
# that triggers the overflow is not run.
speedglm(fo, data = da, family = Gamma(log), sparse = FALSE)
+1

All Articles