Read binary data in R efficiently

I am reading binary data from a text file that is structured as follows:

0101010100101010101010101010
1010101001010101010101010111
1111101010101010100101010101

The file has 800 lines. Every line has the same length, but that length differs between files, so it makes no sense to hard-code it. I want the input data to be stored in a data frame in which each line of the file is a row and each digit is stored in its own column, for example:

col1 col2 col3 col4
0      1    0    1

I am currently doing it like this

# Put a space between the characters of every line so read.table() can split
# them into columns, then convert the resulting data frame to a matrix.
g <- as.matrix(
  read.table(text = gsub("", " ", readLines("input")))
)

However, this takes too much time, since there are about 70,000 0/1 in each row.

Is there a faster way to do this?

+4
source share
3 answers

I would recommend looking into read_fwf from the "readr" package. You can do something like this:

library(readr)
len <- nchar(readLines("yourfile.txt", n = 1))
read_fwf("yourfile.txt", fwf_widths(rep(1, len)))

"iotools" , :

library(iotools)
len <- nchar(readLines("yourfile.txt", n = 1))
input.file("yourfile.txt", formatter = dstrfw, 
            col_types = rep("integer", len), widths = rep(1, len))

POC:

a <- tempfile()

writeLines("0101010100101010101010101010
1010101001010101010101010111
1111101010101010100101010101", a)

len <- nchar(readLines(a, n = 1))

library(readr)
read_fwf(a, fwf_widths(rep(1, len)))
# Source: local data frame [3 x 28]
# 
#   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28
# 1  0  1  0  1  0  1  0  1  0   0   1   0   1   0   1   0   1   0   1   0   1   0   1   0   1   0   1   0
# 2  1  0  1  0  1  0  1  0  0   1   0   1   0   1   0   1   0   1   0   1   0   1   0   1   0   1   1   1
# 3  1  1  1  1  1  0  1  0  1   0   1   0   1   0   1   0   1   0   0   1   0   1   0   1   0   1   0   1

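Unfortunately, at least in the timings below, read_fwf is very slow. Both the "iotools" approach and the awk + fread approach are much faster.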

Sample data:

## Creates a file named "somefile.txt"
set.seed(1)
A <- replicate(10, sample(0:1, 70000, TRUE), FALSE)
A <- sapply(A, paste, collapse = "")
writeLines(rep(A, 800/length(A)), "somefile.txt")

. , , , .

Wow, readr is really slow on this data :-)

# Read a file of fixed-length digit strings with readr::read_fwf(), placing
# each character in its own column.
#
# infile: path to the text file.
Freadr <- function(infile = "somefile.txt") {
  # The width of the first line is used as the number of columns.
  first_line <- readLines(infile, n = 1)
  n_cols <- nchar(first_line)
  # Describe the file as n_cols fixed-width fields, one character each.
  col_spec <- fwf_widths(rep(1, n_cols))
  read_fwf(infile, col_spec)
}
system.time(temp1 <- Freadr())
# |===============================================================| 100%   53 MB
#    user  system elapsed 
# 466.740   0.384 466.506 

# Read a file of fixed-length digit strings with iotools, placing each
# character in its own integer column.
#
# infile: path to the text file.
Fiotools <- function(infile = "somefile.txt") {
  # The width of the first line is used as the number of columns.
  n_cols <- nchar(readLines(infile, n = 1))
  types <- rep("integer", n_cols)
  one_char_each <- rep(1, n_cols)
  # dstrfw is the fixed-width formatter; col_types and widths are forwarded to it.
  input.file(infile, formatter = dstrfw,
             col_types = types, widths = one_char_each)
}
system.time(temp2 <- Fiotools())
#    user  system elapsed 
#   7.248   0.016   7.273 

# Read a file of digit strings by letting awk append a comma to every
# character and parsing awk's output with data.table::fread().
#
# infile: path to the text file.
# Returns whatever fread() returns for the comma-separated stream.
Fawk <- function(infile = "somefile.txt") {
  # Quote the path so names containing spaces or shell metacharacters are
  # handed to awk as a single literal argument.
  cmd <- sprintf("awk '{gsub(/./,\"&,\", $1);print $1}' %s", shQuote(infile))
  # Pass the command explicitly via cmd= rather than relying on fread()
  # guessing that its input is a shell command.
  fread(cmd = cmd)
}
system.time(temp3 <- Fawk())
#    user  system elapsed 
#  12.948   0.156  13.109 

A base R option:

# Base R reader: turn each line of digit characters into one numeric matrix row.
#
# infile: path to the text file.
fun4 <- function(infile = "somefile.txt") {
  # Split every line into single characters (fixed = TRUE: no regex matching).
  chars_per_line <- strsplit(readLines(infile), split = "", fixed = TRUE)
  # Convert each line's characters to numbers, then stack the vectors as rows.
  numeric_rows <- lapply(chars_per_line, as.numeric)
  do.call(rbind, numeric_rows)
}
system.time(fun4())
#    user  system elapsed 
#   9.056   0.260   9.304 

Note: the results of these approaches differ in type: a matrix, a data.frame or a data.table, depending on the function used.

+6

We can use pipe with awk:

read.table(pipe("awk '{gsub(/./,\"& \", $1);print $1}' yourfile.txt"))
#   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
#1  0  1  0  1  0  1  0  1  0   0   1   0   1   0   1   0   1   0   1   0   1
#2  1  0  1  0  1  0  1  0  0   1   0   1   0   1   0   1   0   1   0   1   0
#3  1  1  1  1  1  0  1  0  1   0   1   0   1   0   1   0   1   0   0   1   0
#  V22 V23 V24 V25 V26 V27 V28
#1   0   1   0   1   0   1   0
#2   1   0   1   0   1   1   1
#3   1   0   1   0   1   0   1

read.table(pipe("awk '{gsub(\"\",\" \", $1);print $1}' yourfile.txt"))

Or fread with awk:

library(data.table)
fread("awk '{gsub(/./,\"&,\", $1);print $1}' yourfile.txt")

Using data similar to the OP's:

library(stringi)
write.table(stri_rand_strings(800,70000, '[0-1]'), file='binary1.txt',
         row.names=FALSE, quote=FALSE, col.names=FALSE)

system.time(fread("awk '{gsub(/./,\"&,\", $1);print $1}' binary1.txt"))
#  user  system elapsed 
#16.444   0.108  16.542 
+7

, , ( ), data.frame, ( !). , .

lns = strsplit(readLines("somefile.txt"), "")

Then convert the characters to integers and reshape them into a matrix:

v = match(unlist(lns), c("0", "1")) - 1L
m = matrix(v, nrow=length(lns), byrow=TRUE)

# Read a text file of 0/1 digit strings into an integer matrix with one row
# per line and one column per character.
#
# fname: path to the text file to read.
# Returns an integer matrix; any character other than "0" or "1" becomes NA.
input2matrix <- function(fname) {
    # Read from the file the caller asked for (previously the path was
    # hard-coded and the fname argument was ignored).
    lns <- strsplit(readLines(fname), "")
    # match() gives positions 1/2 for "0"/"1"; subtracting 1L yields integer 0/1.
    v <- match(unlist(lns), c("0", "1")) - 1L
    # Characters were collected line by line, so fill the matrix by row.
    matrix(v, nrow = length(lns), byrow = TRUE)
}

5 800 x 70000. , ( iotools, R- C-) ( R!).

+2

All Articles