Parsing a large JSON log file in Node.js

I have the following JSON file:

sensorlogs.json {"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12}, {"UTCTime":10000002,"s1":23,"s2":33,"s4":13}, {"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14}, {"UTCTime":10000005,"s1":26,"s2":36,"s3":44,"s4":16}, {"UTCTime":10000006,"s1":27,"s2":37,"s4":17}, {"UTCTime":10000004,"s1":25,"s2":35,"s4":15}, ... {"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99} ]} 

Sensors s1, s2, s3, etc. all transmit at different frequencies (note that s3 transmits every 2 seconds, and that timestamps may be out of order).

How can I achieve something like:

    Analyzing s1:
    s = [[10000001, 22], [10000002, 23], ... [12345678, 57]]
    s1 had 2 missing entries
    Analyzing s2:
    s = [[10000001, 32], [10000002, 33], ... [12345678, 35]]
    s2 had 0 missing entries
    Analyzing s3:
    s = [[10000001, 42], [10000003, 43], ... [12345678, 77]]
    s3 had 0 missing entries
    Analyzing s4:
    s = [[10000001, 12], [10000003, 13], ... [12345678, 99]]
    s4 had 1 missing entries

sensorlogs.json is 16 GB.

Each sensor transmits at a known frequency, so missing entries can be found from the difference between successive UTC timestamps.
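
For example, here is a minimal sketch of that gap check (assuming each sensor's readings have already been collected into a chronologically sorted [timestamp, value] array; countMissing and intervalSec are illustrative names, not part of the log format):

    // Count gaps in a sorted array of [UTCTime, value] pairs,
    // given the sensor's expected transmit interval in seconds.
    function countMissing(readings, intervalSec) {
        var missing = 0;
        for (var i = 1; i < readings.length; i++) {
            var gap = readings[i][0] - readings[i - 1][0];
            // a gap of exactly one interval means nothing was skipped;
            // a gap of two intervals means one entry is missing, and so on
            missing += Math.round(gap / intervalSec) - 1;
        }
        return missing;
    }

    console.log(countMissing([[10000001, 42], [10000003, 43], [10000007, 44]], 2)); // 1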

I cannot keep several large arrays in memory for my analysis due to memory limitations, so I will have to make several passes over the same JSON log file and use only one large array at a time.

What I have so far is:

    var fs = require('fs');
    var lr = require('readline');
    var async = require('async');

    var filePath = 'sensorlogs.json';
    var tmpObj = {};   // sensor keys found in the log
    var arrStrm = [];  // list of sensor keys
    var currSensor;
    var result = [];

    //1. Extract all the keys from the log file
    console.log("Extracting keys... \n");
    var stream = fs.createReadStream(filePath);
    var lineReader = lr.createInterface({ input: stream });
    lineReader.on('line', function (line) {
        getKeys(line); //extract all the keys from the JSON
    });
    stream.on('end', function () {
        //obj -> arr
        for (var key in tmpObj)
            arrStrm.push(key);

        //2. Validate individual sensors
        console.log("Validating the sensor data ...\n");
        //Sequential processing of the sensors in the array
        async.eachSeries(arrStrm, function (key, callback) {
            currSensor = key;
            console.log("validating " + currSensor + "...\n");
            stream = fs.createReadStream(filePath);
            lineReader = lr.createInterface({ input: stream });
            lineReader.on('line', function (line) {
                processLine(line); //Create the arrays for the sensors
            });
            stream.on('end', function () {
                processSensor(currSensor); //Process the data for the current sensor
                callback();
            });
        });
    });

    function getKeys(line) {
        var pos;
        if (((pos = line.indexOf('[')) >= 0) || ((pos = line.indexOf(']')) >= 0))
            return;
        if (line[line.length - 1] == '\r')
            line = line.substr(0, line.length - 1); // discard CR (0x0D)
        if (line[line.length - 1] == ',')
            line = line.substr(0, line.length - 1); // discard trailing comma
        if (line.length > 1) { // ignore empty lines
            var obj = JSON.parse(line); // parse the JSON
            for (var key in obj) {
                if (key != "debug") {
                    if (tmpObj[key] == undefined)
                        tmpObj[key] = [];
                }
            }
        }
    }

Of course, this does not work, and I cannot find anything online that explains how such a thing could be implemented.

Note: I can pick any language of my choice for developing this tool (C/C++, C#, Java, Python), but I turned to JavaScript because of its ability to easily parse JSON (and my interest in improving my JS as well). Would someone suggest an alternative language if JavaScript is not the best one for building such a tool?

Edit: Some important information that was either not very clear, or that I did not include earlier, but which seems important to the question:

  • The data in the JSON log is not being broadcast in real time; it is a saved JSON file on the hard disk.
  • The data is not stored in chronological order, which means the timestamps may be out of order. Each sensor's data therefore has to be sorted by timestamp after it has been collected into the array.
  • I cannot use a separate array for each sensor (that would amount to holding most of the 16 GB of JSON in RAM), so to save memory only one array should be used at a time. And yes, there are more than 4 sensors in my log; this is just a sample (around 20, to give an idea).

I have updated my sample JSON and expected result accordingly.

One solution could be to make several passes over the JSON file, storing one sensor's data with its timestamps in the array at a time, then sorting the array, and finally analyzing the data for corruption and gaps. And that is what I am attempting in my code above.
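
A minimal sketch of that multi-pass idea (assuming the standard fs and readline modules; readSensor is an illustrative helper name, and the wrapper-line skipping mirrors the getKeys cleanup above):

    var fs = require('fs');
    var readline = require('readline');

    // One pass per sensor: collect only that sensor's [timestamp, value]
    // pairs, sort them, then hand them off for gap analysis.
    function readSensor(filePath, sensorKey, done) {
        var readings = [];
        var rl = readline.createInterface({ input: fs.createReadStream(filePath) });
        rl.on('line', function (line) {
            if (line.indexOf('[') >= 0 || line.indexOf(']') >= 0) return; // skip the {"arr":[ and ]} wrapper lines
            line = line.replace(/,\s*$/, ''); // tolerate a trailing comma
            if (line.length < 2) return;      // ignore empty lines
            var obj = JSON.parse(line);
            if (obj[sensorKey] !== undefined) readings.push([obj.UTCTime, obj[sensorKey]]);
        });
        rl.on('close', function () {
            readings.sort(function (a, b) { return a[0] - b[0]; }); // restore chronological order
            done(readings);
        });
    }

    // usage: one sensor at a time, so only one large array lives in memory
    readSensor('sensorlogs.json', 's1', function (readings) {
        console.log('s1 has', readings.length, 'entries');
    });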

2 answers

So you have a big, fat 16 GB sensor log wrapped in JSON.

For starters, parsing the entire 16 GB file as a single JSON document is not realistic, and the opening and closing brackets merely break the line-by-line regularity; they are just annoying characters around the array. We know the file has a beginning and an end anyway, and without them your program could even work on pieces of the file, or on a stream connected directly to the device. So let's say we will process:

 {"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12}, {"UTCTime":10000002,"s1":23,"s2":33,"s4":13}, {"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14}, ... {"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99}, 

Adding the missing comma at the very end, or detecting its absence, should not be too complicated either.

Now every line has the same format and can be parsed as JSON on its own. The real question is: do the sensors output data when they are expected to? If we are sure they speak at the right time and at the right frequency (case 1), but may occasionally miss an entry, all is well. If, however, they can drift slightly along the time axis (case 2), then some heuristic is needed to restore the correct linear timeline, and the analysis takes longer.

If we are not processing this in real time, the first and simplest sanity check on the file is to verify that each line carries the sensor data expected at that line's frequency, right?

In any case, since this is a very large file, it should be processed line by line whenever possible.

In the following program, I considered only case 1, and assumed that we might be handling a continuous stream.

    #!/usr/bin/python
    import json

    sensors = {}
    sensors['s1'] = [1]  # frequencies
    sensors['s2'] = [1]
    sensors['s3'] = [2]
    sensors['s4'] = [1]

    # append a data array and an error counter to each sensors[k],
    # so that it holds [freq, err, data]
    for k, v in sensors.iteritems():
        sensors[k].extend([0, []])
    FRQ = 0; ERR = 1; DAT = 2

    print list(sorted(sensors.items()))
    S = list(sorted(sensors.keys()))

    with open('./sensors.json', "r") as stream:
        i = 0
        for line in stream:
            if not line.rstrip():
                continue               # skip blank lines
            j = json.loads(line[:-2])  # skip the trailing comma and \n
            t = j["UTCTime"]
            for k in S:
                sensor = sensors[k]
                if i % sensor[FRQ] == 0:  # every Nth iteration
                    v = j.get(k)
                    if v is None:
                        sensor[ERR] += 1
                        print k, "has", sensor[ERR], "missing entries"
                    sensor[DAT].append([t, v])  # append that sensor data
                                                # ... filling up the memory
            i += 1

    for k, v in sorted(sensors.iteritems()):
        print k, sensors[k][DAT]
    for k, v in sorted(sensors.iteritems()):
        print k, 'had', sensors[k][ERR], "missing entries"

To handle case 2, we would invert the roles of the None check and the modulo check: verify that a sensor did not write something when it was not scheduled to, and then try to detect the drifts.
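
In the question's Node.js setting, a rough sketch of that inverted check could look like this (the frequencies table and the use of a timestamp offset instead of a line counter are illustrative assumptions, not part of the program above):

    // Case 2 sketch: flag readings that arrive when the sensor was NOT
    // scheduled to report, which hints that its clock has drifted.
    var frequencies = { s1: 1, s2: 1, s3: 2, s4: 1 }; // seconds between reports

    function checkUnexpected(obj, startTime) {
        var offset = obj.UTCTime - startTime;
        Object.keys(frequencies).forEach(function (key) {
            var scheduled = offset % frequencies[key] === 0;
            if (!scheduled && obj[key] !== undefined) {
                console.log(key + ' reported off-schedule at ' + obj.UTCTime);
            }
        });
    }

    checkUnexpected({ UTCTime: 10000002, s3: 43 }, 10000001); // s3 reported off-schedule at 10000002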

One last note: your program risks running out of memory, so keeping all of the data in RAM might not be a good idea. If the separate per-sensor arrays are intended for further processing, it would make more sense to write them to files.
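
For instance, a small sketch of that write-to-files idea in the question's Node.js setting (the per-sensor file-naming scheme is an illustrative assumption):

    var fs = require('fs');

    // One append-only stream per sensor: readings go to disk as
    // "<timestamp> <value>" lines instead of accumulating in RAM.
    var writers = {};
    function writeReading(sensorKey, utcTime, value) {
        if (!writers[sensorKey]) {
            writers[sensorKey] = fs.createWriteStream(sensorKey + '.dat', { flags: 'a' });
        }
        writers[sensorKey].write(utcTime + ' ' + value + '\n');
    }

    writeReading('s3', 10000003, 43); // appends "10000003 43" to s3.dat
    // each sensor's file can later be sorted and scanned for gaps on its own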


Edited again to reflect your latest changes:

    var fs = require('fs');
    var stream = fs.createReadStream('sensorlogs.json', {flags: 'r', encoding: 'utf-8'});
    var buffer = '';
    var headerSkipped = false;
    var sensor = process.argv[2];
    var readings = [];
    var missingCount = 0;

    console.log('Analyzing ' + sensor + ':');

    stream.on('data', function (d) {
        buffer += d.toString();
        processBuffer();
    });

    stream.on('end', function () {
        console.log(readings);
        console.log(sensor + ' had ' + missingCount + ' missing entries');
    });

    function processBuffer() {
        if (!headerSkipped) {
            var start = buffer.indexOf('[{');
            if (start == -1) return;          // wait for more data
            buffer = buffer.slice(start + 1); // drop the {"arr":[ wrapper once
            headerSkipped = true;
        }
        // consume every complete {...} object currently in the buffer;
        // a partial object at the end waits for the next 'data' chunk
        while (true) {
            var objStart = buffer.indexOf('{"');
            if (objStart == -1) break;
            var objEnd = buffer.indexOf('}', objStart);
            if (objEnd == -1) break;
            processLine(buffer.slice(objStart, objEnd + 1));
            buffer = buffer.slice(objEnd + 1);
        }
    }

    function processLine(line) {
        if (line != "") {
            var obj = JSON.parse(line);
            if (obj[sensor] === undefined) { // sensor missing on this line
                missingCount++;
            } else {
                // insertion sort by UTCTime, since the log may be out of order
                var pos;
                for (pos = 0; pos < readings.length; pos++) {
                    if (obj.UTCTime < readings[pos][0]) {
                        readings.splice(pos, 0, [obj.UTCTime, obj[sensor]]);
                        break;
                    }
                }
                if (pos == readings.length) {
                    readings.push([obj.UTCTime, obj[sensor]]);
                }
            }
        }
    }

You will have to call it with the sensor you want to analyze as a parameter:

    node.exe scripts\processJson.js <param>

To test this, I took this sample:

 {"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12}, {"UTCTime":10000005,"s1":20,"s2":30,"s3":40,"s4":10}, {"UTCTime":10000002,"s1":23,"s2":33,"s4":13}, {"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14}, {"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99} ]} 

And the result was:

    > node.exe scripts\processJson.js s1
    Analyzing s1:
    [[10000001, 22], [10000002, 23], [10000003, 24], [10000005, 20], [12345678, 57]]
    s1 had 0 missing entries

    > node.exe scripts\processJson.js s2
    Analyzing s2:
    [[10000001, 32], [10000002, 33], [10000003, 34], [10000005, 30], [12345678, 35]]
    s2 had 0 missing entries

    > node.exe scripts\processJson.js s3
    Analyzing s3:
    [[10000001, 42], [10000003, 43], [10000005, 40], [12345678, 77]]
    s3 had 1 missing entries

    > node.exe scripts\processJson.js s4
    Analyzing s4:
    [[10000001, 12], [10000002, 13], [10000003, 14], [10000005, 10], [12345678, 99]]
    s4 had 0 missing entries
