I use two pipelines: one for downloading images (MyImagesPipeline) and one for saving data to MongoDB (MongoPipeline).
Suppose we have many spiders (spider1, spider2, …); in my example, spider1 and spider5 must not use MyImagesPipeline.
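For reference, ImagesPipeline operates on items that expose `image_urls` and `images` fields (assuming the default field names). A minimal item sketch, with a hypothetical `ProductItem` name, might be:

```python
import scrapy

class ProductItem(scrapy.Item):
    # Default field names consumed by ImagesPipeline.
    image_urls = scrapy.Field()
    images = scrapy.Field()
    # Any other data that MongoPipeline should persist.
    name = scrapy.Field()
    price = scrapy.Field()
```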
settings.py
```python
ITEM_PIPELINES = {
    'scrapycrawler.pipelines.MyImagesPipeline': 1,
    'scrapycrawler.pipelines.MongoPipeline': 2,
}
IMAGES_STORE = '/var/www/scrapycrawler/dowload'
```
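MongoPipeline below reads its connection settings through `from_crawler()`, so settings.py also needs `MONGO_URI` (and optionally `MONGO_DATABASE`, which falls back to `'scraping'`). A minimal sketch with placeholder values:

```python
# Placeholder values; adjust to your MongoDB deployment.
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scraping'
```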
And the complete pipeline code:
```python
import pymongo
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def process_item(self, item, spider):
        # Skip image downloading for the excluded spiders and pass the item through.
        if spider.name not in ['spider1', 'spider5']:
            return super(MyImagesPipeline, self).process_item(item, spider)
        else:
            return item

    def file_path(self, request, response=None, info=None):
        # Store each image under <first letter>/<second letter>/<image name>.
        image_name = request.url.split('/')[-1]
        dir1 = image_name[0]
        dir2 = image_name[1]
        return dir1 + '/' + dir2 + '/' + image_name


class MongoPipeline(object):

    collection_name = 'scrapy_items'
    collection_url = 'snapdeal_urls'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'scraping')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # The original snippet is cut off here; a typical body stores the item and returns it.
        self.db[self.collection_name].insert_one(dict(item))
        return item
```
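For completeness, a spider only has to yield items with `image_urls` populated; MyImagesPipeline then decides from `spider.name` whether to download. A minimal sketch (the spider name, URL, and selectors are illustrative, and `ProductItem` is the hypothetical item above):

```python
import scrapy
from scrapycrawler.items import ProductItem  # hypothetical item module/name


class Spider2(scrapy.Spider):
    name = 'spider2'  # not in the excluded list, so images are downloaded
    start_urls = ['https://example.com/products']

    def parse(self, response):
        item = ProductItem()
        item['name'] = response.css('h1::text').extract_first()
        item['image_urls'] = response.css('img::attr(src)').extract()
        yield item
```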
Nanhe Kumar, Jun 30 '16 at 13:33