Node.js async queue too fast (slowing down the async queue method)

I have an HTTP GET request and I want to parse the response and save it to my database.

If I call crawl(i) on its own, I get good results. But I need to call crawl() for every id from 1 to 2000, and then I still get results, but some responses seem to be missing and some are duplicated. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.

What I'm crawling: the magicseaweed.com RSS feed (one URL per id, shown in the code below).

My node functions:

    function getOptions(i) {
        return {
            host: 'magicseaweed.com',
            path: '/syndicate/rss/index.php?id=' + i + '&unit=uk',
            method: 'GET'
        }
    };

    function crawl(i) {
        var req = http.request(getOptions(i), function (res) {
            res.on('data', function (body) {
                parseLocation(body);
            });
        });
        req.end();
    }

    function parseLocation(body) {
        parser.parseString(body, function (err, result) {
            if (result && typeof result.rss != 'undefined') {
                var locationTitle = result.rss.channel[0].title;
                var locationString = result.rss.channel[0].item[0].link[0];
                var location = new Location({
                    id: locationString.split('/')[2],
                    name: locationTitle
                });
                location.save();
            }
        });
    }

    N = 2; // # of simultaneous tasks
    var q = async.queue(function (task, callback) {
        crawl(task.url);
        callback();
    }, N);

    q.drain = function () {
        console.log('Crawling done.');
    }

    for (var i = 0; i < 100; i++) {
        q.push({ url: 'http://magicseaweed.com/syndicate/rss/index.php?id=' + i + '&unit=uk' });
    }

[EDIT] OK, after a lot of testing it seems that the service I'm crawling cannot handle that many requests that fast, because when I execute each request sequentially I get all the correct responses.

Is there a way to slow down the async queue method?

+4
5 answers

You should take a look at the great async module, which simplifies async tasks like this. You can use queue, simple example:

    var N = 5; // # of simultaneous tasks

    var q = async.queue(function (task, callback) {
        somehttprequestfunction(task.url, function () {
            callback();
        });
    }, N);

    q.drain = function () {
        console.log('all items have been processed');
    }

    for (var i = 0; i < 2000; i++) {
        q.push({ url: "http://somewebsite.com/" + i + "/feed/" });
    }

It keeps a window of concurrent tasks, and a slot only becomes available for the next task when you call the callback. The difference is that right now your code opens all 2000 connections immediately, which obviously gives a high failure rate. Limiting it to a reasonable value, 5, 10 or 20 (depending on the site and your connection), will lead to a much better success rate. If a request fails, you can always retry it, or push the task onto another async queue for another trial. The key point is to call callback() in the queue function, so that a slot becomes available when the task is done.
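For the retry case mentioned above, here is a minimal sketch of re-pushing a failed task onto the same queue. The doRequest helper and the retries counter are made-up names for illustration, not part of this answer:

    var async = require('async');
    var http = require('http');

    // illustrative request helper (not from the answer): fetch a URL and
    // call back with an error or null once the response has ended
    function doRequest(url, done) {
        http.get(url, function (res) {
            res.resume(); // drain the response; a real crawler would parse it
            res.on('end', function () { done(null); });
        }).on('error', done);
    }

    var q = async.queue(function (task, callback) {
        doRequest(task.url, function (err) {
            if (err && task.retries < 3) {
                // put the failed task back onto the queue for another attempt
                q.push({ url: task.url, retries: task.retries + 1 });
            }
            callback(); // always release the slot, success or failure
        });
    }, 5);

    q.push({ url: 'http://somewebsite.com/1/feed/', retries: 0 });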

+13
    var q = async.queue(function (task, callback) {
        crawl(task.url);
        callback();
    }, N);

You call the callback, and therefore start the next task, immediately after starting the previous one, so the queue is effectively meaningless. You should change your code like this:

    // first, modify your 'crawl' function to take a callback argument,
    // and call this callback after the job is done.

    // then
    var q = async.queue(function (task, next /* naming this argument 'next' is more meaningful */) {
        crawl(task.url, function () {
            // after this one is done, start the next one.
            next();
        });

        // or, more simply:
        // crawl(task.url, next);
    }, N);
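For completeness, a rough sketch of what crawl could look like once it takes a callback, reusing getOptions and parseLocation from the question; this is an illustration under those assumptions, not the answer's exact code. It also buffers the whole response before parsing, rather than parsing each 'data' chunk separately:

    var http = require('http'); // getOptions and parseLocation come from the question above

    function crawl(i, done) {
        var req = http.request(getOptions(i), function (res) {
            var body = '';
            res.on('data', function (chunk) { body += chunk; });
            res.on('end', function () {
                parseLocation(body);
                done(); // tell the queue this task is finished
            });
        });
        req.on('error', function (err) {
            console.error(err);
            done(); // release the slot even when the request fails
        });
        req.end();
    }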
+9

Another option if you want it: vanilla JS without any fancy libraries.

    var incrementer = 0;
    var resultsArray = [];

    var myInterval = setInterval(function () {
        incrementer++;
        if (incrementer == 100) {
            clearInterval(myInterval);
            // when done, parse the results array
        }
        // make request here
        // push request result to array here
    }, 500);

It invokes the function every half second: an easy way to throttle the requests and stop after x of them.
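A rough sketch of how the two placeholder comments might be filled in with the asker's feed URLs; the http.get call and the id counter are my own additions for illustration:

    var http = require('http');

    var id = 0;
    var resultsArray = [];

    var myInterval = setInterval(function () {
        id++;
        if (id > 100) {
            clearInterval(myInterval);
            console.log('collected', resultsArray.length, 'responses');
            return;
        }
        // one request every 500 ms, so the remote service is never flooded
        http.get('http://magicseaweed.com/syndicate/rss/index.php?id=' + id + '&unit=uk', function (res) {
            var body = '';
            res.on('data', function (chunk) { body += chunk; });
            res.on('end', function () { resultsArray.push(body); });
        }).on('error', function (err) {
            console.error(err);
        });
    }, 500);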

+2

I know I'm a little late to the question, but here is a solution I wrote to slow down the number of requests when testing an API endpoint, using Node 4 or Node 5:

    var fs = require('fs');
    var supertest = require('supertest');
    var request = supertest("http://sometesturl.com/api/test/v1/");
    var Helper = require('./check.helper');
    var basicAuth = Helper.basicAuth;
    var options = Helper.options;

    // small helper assumed here: yields the years from start to end inclusive
    function* range(start, end) {
        for (let i = start; i <= end; i++) yield i;
    }

    fs.readFile('test.txt', function (err, data) {
        var parsedItems = JSON.parse(data);
        var urlparts = [];

        // create a queue
        for (let year of range(1975, 2016)) {
            for (var make in parsedItems[year]) {
                console.log(year, make, '/models/' + year + '/' + make);
                urlparts.push({ urlpart: '/models/' + year + '/' + make, year: year, make: make });
            }
        }

        // start dequeuing
        waitDequeue();

        // this function calls itself after the makeRequest promise completes
        function waitDequeue() {
            var item = urlparts.pop();
            if (item) {
                makeRequest(item)
                    .then(function () {
                        // wait this long before the next dequeue
                        setTimeout(function () {
                            waitDequeue();
                        }, 3000);
                    });
            } else {
                write(parsedItems);
            }
        }

        // make a request, mutate parsedItems, then resolve
        function makeRequest(item) {
            return new Promise((resolve, reject) => {
                request
                    .get(item.urlpart)
                    .set(options.auth[0], options.auth[1])
                    .set(options.type[0], options.type[1])
                    .end(function (err, res) {
                        if (err) return reject(err);
                        console.log(res.body);
                        res.body.forEach(function (model) {
                            parsedItems[item.year][item.make][model] = {};
                        });
                        resolve();
                    });
            });
        }

        // write the results back to the file
        function write(parsedItems) {
            fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function (err) {
                console.log(err);
            });
        }
    });
0

A bit late, but I found this works! Using async you can slow down the queue by using whilst inside the task handler, for example:

    var q = async.priorityQueue(function (task, callback) {
        // your processing code here for each task

        // when ready to complete the task, delay it by calling async.whilst
        var count = 0;
        async.whilst(
            // loop until ten one-second delays have passed
            function () { return count < 10; },
            function (callback) {
                count++;
                setTimeout(function () {
                    callback(null, count);
                }, 1000);
            },
            function (err, n) {
                // n seconds have passed
                callback(); // callback to the q handler
            }
        ); // whilst
    }, 5);
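A simpler variation on the same idea (my own sketch, not part of this answer) is to delay the queue callback directly with setTimeout, which holds each worker slot for a fixed pause after every task:

    var async = require('async');

    var q = async.queue(function (task, callback) {
        // ... process the task here ...

        // wait one second before releasing this worker slot
        setTimeout(function () {
            callback();
        }, 1000);
    }, 5);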
0
