How can I start writing unit tests for web scraping with Scrapy in Python?

from scrapy.contrib.spiders import XMLFeedSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

# MailNotify, NewsItem, escape and remove_html_tags are project-specific
# helpers referenced below.


class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # the XPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]'
                            '/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")
        items = []
        for title in titles:  # was "for titles in titles", which shadowed the list
            item = NewsItem()
            item['title'] = escape(''.join(title.select('a/text()').extract()))
            item['link'] = ("http://www.aljazeera.com" +
                            escape(''.join(title.select('a/@href').extract())))
            item['description'] = ''
            request = Request(item['link'], meta={'item': item},
                              callback=self.parse_detail)
            items.append(request)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item

I am currently working with Scrapy to crawl a website, and I know a little about unittest in Python. But how can I write unit tests that verify the link works and that item['location'] and item['details'] actually get a value? I have read about Scrapy contracts but could not make sense of them. So how do you write unit tests in this case?

python unit-testing web-scraping scrapy scrapy-spider
1 answer

If we are talking specifically about how to test the spiders (as opposed to pipelines or loaders), then what we did was feed the spider a "fake response" built from a local HTML file. Code example:

import os

from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a fake Scrapy HTTP response from an HTML file."""
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            # Resolve relative names against the directory of this test module.
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        with open(file_path, 'r') as f:  # close the file instead of leaking the handle
            file_content = f.read()
    else:
        file_content = ''
    return TextResponse(url=url, request=request,
                        body=file_content, encoding='utf-8')

Then, in your TestCase class, call the fake_response() function and feed the resulting response to the parse() callback:

from unittest.case import TestCase


class MyTestCase(TestCase):

    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
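Applied to the AljazeeraSpider from the question, the same idea could look like the sketch below. Note that its parse() returns Request objects rather than items, so the first test asserts on those requests, and the second drives parse_detail() directly; the fixture file names aljazeera_front.html and article.html are placeholders for pages you would save from the live site.

from unittest.case import TestCase

from scrapy.http import Request


class AljazeeraSpiderTest(TestCase):

    def setUp(self):
        self.spider = AljazeeraSpider()

    def test_parse_yields_detail_requests(self):
        response = fake_response('aljazeera_front.html',
                                 url='http://www.aljazeera.com/')
        requests = list(self.spider.parse(response))
        # The front page fixture should produce at least one detail request.
        self.assertTrue(requests)
        for request in requests:
            self.assertIsInstance(request, Request)
            # Each request carries the partially filled item in its meta.
            self.assertTrue(request.meta['item']['title'])
            self.assertTrue(request.url.startswith('http://www.aljazeera.com'))

    def test_parse_detail_fills_item(self):
        response = fake_response('article.html',
                                 url='http://www.aljazeera.com/some-article')
        # parse_detail() expects the item handed over via the request meta,
        # which fake_response() attaches to the response.
        item = NewsItem()
        item['title'] = 'My Title'
        item['link'] = response.url
        response.meta['item'] = item
        result = self.spider.parse_detail(response)
        self.assertTrue(result['details'])
        self.assertTrue(result['published_date'])

This answers the question directly: if the XPaths stop matching the page, item['details'] or item['published_date'] comes back empty and the assertions fail.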

In addition, you should start using Item Loaders with input and output processors; this gives you better modularity and, therefore, isolation: the spider simply yields item instances, while data extraction and cleanup are encapsulated inside the loader, which you can test separately.
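For illustration, here is a minimal sketch of that split, assuming the NewsItem fields from the question; the processors and XPaths are illustrative, and the import paths are those of Scrapy 1.x (in Scrapy 2.x the processors live in the itemloaders package).

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class NewsLoader(ItemLoader):
    default_output_processor = TakeFirst()

    # Strip whitespace on the way in; join the fragments of the
    # article body into a single string on the way out.
    title_in = MapCompose(lambda s: s.strip())
    details_in = MapCompose(lambda s: s.strip())
    details_out = Join(' ')

The spider callback then shrinks to wiring selectors into the loader:

def parse_detail(self, response):
    loader = NewsLoader(item=response.meta['item'], response=response)
    loader.add_xpath('details', '//td[@class="DetailedSummary"]/p/text()')
    loader.add_xpath('published_date',
                     '//span[@id="ctl00_cphBody_lblDate"]/text()')
    return loader.load_item()

and the processors get a fast, spider-free unit test:

from unittest.case import TestCase


class NewsLoaderTest(TestCase):

    def test_details_are_stripped_and_joined(self):
        loader = NewsLoader(item=NewsItem())
        loader.add_value('details', ['  First paragraph. \n', ' Second. '])
        item = loader.load_item()
        self.assertEqual(item['details'], 'First paragraph. Second.')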
