You could simply use spider by mikeal - it supports this use case out of the box and is designed to be clean.
https://github.com/mikeal/spider
Example:
var spider = require('../main'); spider() .route('www.nytimes.com', '/pages/dining/index.html', function (window, $) { $('a').spider(); }) .route('travel.nytimes.com', '*', function (window, $) { $('a').spider(); if (this.fromCache) return; var article = { title: $('nyt_headline').text(), articleBody: '', photos: [] } article.body = '' $('div.articleBody').each(function () { article.body += this.outerHTML; }) $('div#abColumn img').each(function () { var p = $(this).attr('src'); if (p.indexOf('ADS') === -1) { article.photos.push(p); } })
Also, if you intend to use node.io, I think it passes the raw data as an optional third argument to the callback:
io.getHTML('someurl', function(err, junk, data){ jsdom.env({ html: data, scripts : [ 'http://code.jquery.com/jquery-1.5.min.js' ] }, function(err, window) { var $ = window.jQuery;
source share