What you can do is iterate over the lines in the file and keep only the lines that have the correct length.
I defined the following csv test file:
    1;2;3;4
    1;2;3;4
    1;2;3
    1;2;3;4
Using read.table fails:
    > read.table("test.csv", sep = ";")
    Error in scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  :
      line 3 did not have 4 elements
Now an iterative approach:
    require(plyr)
    no_lines = 4
    correct_length = 4
    file_con = file("test.csv", "r")
    result = ldply(1:no_lines, function(line) {
      dum = strsplit(readLines(file_con, n = 1), split = ";")[[1]]
      if(length(dum) == correct_length) {
        return(dum)
      } else {
        cat(sprintf("Skipped line %s\n", line))
        return(NULL)
      }
    })
    close(file_con)

    > result
      V1 V2 V3 V4
    1  1  2  3  4
    2  1  2  3  4
    3  1  2  3  4
Of course, this is a trivial example, since the file is really small. Let's create a larger example to serve as a benchmark.
    # First file with invalid rows
    norow = 10e5      # number of rows
    no_lines = round(runif(norow, min = 3, max = 4))
    no_lines[1] = correct_length
    file_content = ldply(no_lines, function(line) paste(1:line, collapse = ";"))
    writeLines(paste(file_content[[1]], sep = "\n"), "big_test.csv")

    # Same length with valid rows
    file_content = ldply(rep(4, norow), function(line) paste(1:line, collapse = ";"))
    writeLines(paste(file_content[[1]], sep = "\n"), "big_normal.csv")
Now for the benchmark:
    # Iterative approach
    system.time({file_con <- file("big_test.csv", "r")
      result_test <- ldply(1:norow, function(line) {
        dum = strsplit(readLines(file_con, n = 1), split = ";")[[1]]
        if(length(dum) == correct_length) {
          return(dum)
        } else {
          # Commenting this out speeds things up by 30%
          #cat(sprintf("Skipped line %s\n", line))
          return(NULL)
        }
      })
      close(file_con)})
       user  system elapsed
     20.559   0.047  20.775

    # Normal read.table
    system.time(result_normal <- read.table("big_normal.csv", sep = ";"))
       user  system elapsed
      1.060   0.015   1.079

    # read.table with fill = TRUE
    system.time({result_fill <- read.table("big_test.csv", sep = ";", fill = TRUE)
                 na_rows <- complete.cases(result_fill)
                 result_fill <- result_fill[na_rows, ]})   # keep only complete rows
       user  system elapsed
      1.161   0.033   1.203

    # Specifying which type the columns are (e.g. character or numeric)
    # using the colClasses argument
    system.time({result_fill <- read.table("big_test.csv", sep = ";", fill = TRUE,
                                           colClasses = rep("numeric", 4))
                 na_rows <- complete.cases(result_fill)
                 result_fill <- result_fill[na_rows, ]})   # keep only complete rows
       user  system elapsed
      0.933   0.064   1.001
So the iterative approach is rather slow, but 20 seconds for 1 million lines may be acceptable (although that depends on your definition of acceptable), especially if you only need to read the file once and can then store the result with save for later retrieval. The solution proposed by @Paolo is almost as fast as a regular read.table call: rows containing the wrong number of columns (and thus NAs) are dropped using complete.cases. Specifying the column classes improves performance further, and I expect this effect to grow as the number of columns and rows increases.
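For example, caching the parsed result in an .RData file means the slow parsing step only has to run once (a minimal sketch; the file name is just a placeholder):

    # Parse once (slow), then cache the parsed data frame to disk
    # ("big_test_parsed.RData" is a made-up file name for illustration)
    save(result_test, file = "big_test_parsed.RData")
    # In later sessions, loading the cached object is much faster than re-parsing
    load("big_test_parsed.RData")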
So in conclusion, the best option is to use read.table with fill = TRUE, specifying the column classes. An iterative approach using ldply is only a good option if you want more flexibility in choosing how to read the lines, for example reading a line only if a specific value exceeds a threshold. But that can probably be done faster by reading all the data into R and then creating a subset; see the sketch below. Only when the data is larger than your RAM can I imagine the iterative approach having its merits.
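To illustrate the "read everything, then subset" alternative, here is a minimal sketch; the threshold value and the choice of column V1 are made up for the example:

    # Read the whole file at once, then keep only the rows of interest
    # (threshold and column V1 are hypothetical; adapt to your data)
    threshold = 2
    all_data = read.table("big_test.csv", sep = ";", fill = TRUE,
                          colClasses = rep("numeric", 4))
    # Drop incomplete rows and apply the threshold filter in one subsetting step
    subset_data = all_data[complete.cases(all_data) & all_data$V1 > threshold, ]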