The easiest way to read a CSV file mapped to memory?

When I read files in C ++ (11), I map them in memory using:

boost::interprocess::file_mapping* fm = new file_mapping(path, boost::interprocess::read_only);
boost::interprocess::mapped_region* region = new mapped_region(*fm, boost::interprocess::read_only);
char* bytes = static_cast<char*>(region->get_address());

This is normal when I want to quickly read bytes by byte. However, I created a csv file that I would like to map into memory, read each line and separate each line with a comma.

Is there a way to do this with a few modifications of my above code?

(I map to memory because I have a lot of memory and I don't want a bottleneck with the disk / I / O stream).

+3
source share
2 answers

" ". 116 MiB CSV ( 2.5Mio [1]) ~ 1 .

, ( ).

:

  • ~ 3 , wc csv.txt
  • , perl ( ):

    perl -ne '$fields{scalar split /,/}++; END { map { print "$_\n" } keys %fields  }' csv.txt
    
  • , (LANG=C wc csv.txt), ( 1,5 )

:

using CsvField = boost::string_ref;
using CsvLine  = std::vector<CsvField>;
using CsvFile  = std::vector<CsvLine>;  // keep it simple :)

struct CsvParser : qi::grammar<char const*, CsvFile()> {
    CsvParser() : CsvParser::base_type(lines)
    {
        using namespace qi;

        field = raw [*~char_(",\r\n")] 
            [ _val = construct<CsvField>(begin(_1), size(_1)) ]; // semantic action
        line  = field % ',';
        lines = line  % eol;
    }
    // declare: line, field, fields
};

( ) - CsvField .

:

int main()
{
    boost::iostreams::mapped_file_source csv("csv.txt");

    CsvFile parsed;
    if (qi::parse(csv.data(), csv.data() + csv.size(), CsvParser(), parsed))
    {
        std::cout << (csv.size() >> 20) << " MiB parsed into " << parsed.size() << " lines of CSV field values\n";
    }
}

116 MiB parsed into 2578421 lines of CSV values

, std::string:

for (int i = 0; i < 10; ++i)
{
    auto l     = rand() % parsed.size();
    auto& line = parsed[l];
    auto c     = rand() % line.size();

    std::cout << "Random field at L:" << l << "\t C:" << c << "\t" << line[c] << "\n";
}

, :

Random field at L:1979500    C:2    sateen's
Random field at L:928192     C:1    sackcloth's
Random field at L:1570275    C:4    accompanist's
Random field at L:479916     C:2    apparel's
Random field at L:767709     C:0    pinks
Random field at L:1174430    C:4    axioms
Random field at L:1209371    C:4    wants
Random field at L:2183367    C:1    Klondikes
Random field at L:2142220    C:1    Anthony
Random field at L:1680066    C:2    pines

Live On Coliru


[1] ,

while read a && read b && read c && read d && read e
do echo "$a,$b,$c,$d,$e"
done < /etc/dictionaries-common/words

csv.txt, 2,5 .

+4

istringstream , :

const std::string stringBuffer(bytes, region->get_size());
std::istringstream is(stringBuffer);
typedef boost::tokenizer< boost::escaped_list_separator<char> > Tokenizer;
std::string line;
std::vector<std::string> parsed;
while(getline(is, line))
{
    Tokenizer tokenizer(line);
    parsed.assign(tokenizer.begin(),tokenizer.end());
    for (auto &column: parsed)
    {
        // 
    }
}

, - . , , , IO, . , . , , memory_mapping, , (, ), .

+1

All Articles