Does anyone have sample code for an SQLite pipeline in Scrapy?

I am looking for sample SQLite pipeline code for Scrapy. I know there is no built-in support for this, but I'm sure it has been done before. Only actual code can help me: I know just enough Python and Scrapy to complete my very limited task, and I need the code as a starting point.

+6
python sqlite export scrapy
4 answers

I did something like this:

# Author: Jay Vaughan
#
# Pipelines for processing items returned from a scrape.
# Don't forget to add the pipeline to the ITEM_PIPELINES setting.
# See: http://doc.scrapy.org/topics/item-pipeline.html
from scrapy import log
from pysqlite2 import dbapi2 as sqlite

# This pipeline takes the Item and stuffs it into scrapedata.db
class scrapeDatasqLitePipeline(object):
    def __init__(self):
        # Possibly we should be doing this in spider_open instead, but okay
        self.connection = sqlite.connect('./scrapedata.db')
        self.cursor = self.connection.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS myscrapedata '
                            '(id INTEGER PRIMARY KEY, url VARCHAR(80), desc VARCHAR(80))')

    # Take the item and put it in the database - do not allow duplicates
    def process_item(self, item, spider):
        self.cursor.execute("select * from myscrapedata where url=?",
                            (item['url'][0],))
        result = self.cursor.fetchone()
        if result:
            log.msg("Item already in database: %s" % item, level=log.DEBUG)
        else:
            self.cursor.execute(
                "insert into myscrapedata (url, desc) values (?, ?)",
                (item['url'][0], item['desc'][0]))
            self.connection.commit()
            log.msg("Item stored: %s" % item, level=log.DEBUG)
        return item

    def handle_error(self, e):
        log.err(e)
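As the comment at the top says, the pipeline also has to be registered in ITEM_PIPELINES. A minimal sketch, assuming the Scrapy project module is called myproject (that name is a placeholder); recent Scrapy versions expect a dict mapping the class path to an order value, while very old releases accepted a plain list:

# settings.py (the "myproject" module name is an assumption)
ITEM_PIPELINES = {
    'myproject.pipelines.scrapeDatasqLitePipeline': 300,
}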
+9

Here is an SQLite pipeline built with SQLAlchemy. With SQLAlchemy you can easily switch to a different database later if needed.

In settings.py, add the database configuration:

# settings.py
# ...
DATABASE = {
    'drivername': 'sqlite',
    # 'host': 'localhost',
    # 'port': '5432',
    # 'username': 'YOUR_USERNAME',
    # 'password': 'YOUR_PASSWORD',
    'database': 'books.sqlite'
}

Then in pipelines.py add the following

# pipelines.py
import logging

from scrapy import signals
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import NullPool

logger = logging.getLogger(__name__)

DeclarativeBase = declarative_base()


class Book(DeclarativeBase):
    __tablename__ = "books"

    id = Column(Integer, primary_key=True)
    title = Column('title', String)
    author = Column('author', String)
    publisher = Column('publisher', String)
    url = Column('url', String)
    scrape_date = Column('scrape_date', DateTime)

    def __repr__(self):
        return "<Book({})>".format(self.url)


class SqlitePipeline(object):
    def __init__(self, settings):
        self.database = settings.get('DATABASE')
        self.sessions = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls(crawler.settings)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def create_engine(self):
        # connect_args such as {'charset': 'utf8'} belong to MySQL-style
        # drivers; SQLite's connect() rejects them, so they are omitted here.
        # Note: on SQLAlchemy 1.4+ build the URL with URL.create(**self.database).
        engine = create_engine(URL(**self.database), poolclass=NullPool)
        return engine

    def create_tables(self, engine):
        DeclarativeBase.metadata.create_all(engine, checkfirst=True)

    def create_session(self, engine):
        session = sessionmaker(bind=engine)()
        return session

    def spider_opened(self, spider):
        engine = self.create_engine()
        self.create_tables(engine)
        session = self.create_session(engine)
        self.sessions[spider] = session

    def spider_closed(self, spider):
        session = self.sessions.pop(spider)
        session.close()

    def process_item(self, item, spider):
        session = self.sessions[spider]
        book = Book(**item)
        link_exists = session.query(Book).filter_by(url=item['url']).first() is not None

        if link_exists:
            logger.info('Item {} is in db'.format(book))
            return item

        try:
            session.add(book)
            session.commit()
            logger.info('Item {} stored in db'.format(book))
        except Exception:
            logger.info('Failed to add {} to db'.format(book))
            session.rollback()
            raise

        return item

and items.py should look like this:

# items.py
import scrapy


class BookItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    url = scrapy.Field()  # needed: the pipeline deduplicates on item['url'] and Book has a url column
    scrape_date = scrapy.Field()

You may also consider moving the Book class into items.py.
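For completeness, here is a minimal spider sketch showing how items feed this pipeline. The spider name, start URL, and CSS selectors are placeholders and not part of the original answer; the pipeline itself still has to be enabled in ITEM_PIPELINES (e.g. 'myproject.pipelines.SqlitePipeline': 300).

# spiders/books.py (hypothetical spider; URLs and selectors are placeholders)
from datetime import datetime

import scrapy

from myproject.items import BookItem  # assumes the project module is "myproject"


class BooksSpider(scrapy.Spider):
    name = 'books'
    start_urls = ['http://example.com/books']

    def parse(self, response):
        # Each yielded BookItem is passed to SqlitePipeline.process_item,
        # which turns it into a Book row via Book(**item).
        for row in response.css('div.book'):
            yield BookItem(
                title=row.css('h2::text').get(),
                author=row.css('.author::text').get(),
                publisher=row.css('.publisher::text').get(),
                url=response.urljoin(row.css('a::attr(href)').get()),
                scrape_date=datetime.now(),
            )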

+5

If you are comfortable with Twisted's adbapi, you can take this MySQL pipeline as a starting point: http://github.com/darkrho/scrapy-googledir-mysql/blob/master/googledir/pipelines.py

And use this line in __init__:

 self.dbpool = adbapi.ConnectionPool("sqlite3", database="/path/sqlite.db") 
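A rough sketch of what that adaptation could look like for SQLite. This is not the linked pipeline verbatim: the class name, table name, and the url/desc fields are assumptions made for illustration.

# pipelines.py - a sketch, assuming items with 'url' and 'desc' fields
import sqlite3

from twisted.enterprise import adbapi


class SqliteAdbapiPipeline(object):
    def __init__(self):
        # check_same_thread=False lets adbapi use the sqlite3 connection from
        # its thread pool; cp_max=1 keeps all writes on a single connection.
        self.dbpool = adbapi.ConnectionPool(
            "sqlite3",
            database="/path/sqlite.db",
            check_same_thread=False,
            cp_max=1,
        )
        self.dbpool.runInteraction(self._create_table)

    def _create_table(self, cursor):
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS myscrapedata "
            "(id INTEGER PRIMARY KEY, url TEXT UNIQUE, desc TEXT)")

    def process_item(self, item, spider):
        # Returning the Deferred lets Scrapy wait for the insert to finish
        # before passing the item on.
        d = self.dbpool.runInteraction(self._insert, item)
        d.addCallback(lambda _: item)
        return d

    def _insert(self, cursor, item):
        # The UNIQUE constraint on url plus INSERT OR IGNORE avoids duplicates.
        cursor.execute(
            "INSERT OR IGNORE INTO myscrapedata (url, desc) VALUES (?, ?)",
            (item['url'], item['desc']))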
+3

For those trying to solve a similar problem, I just came across a good SQLite item exporter for Scrapy: https://github.com/RockyZ/Scrapy-sqlite-item-exporter .

After including it in your project settings, you can use it with:

 scrapy crawl <spider name> -o sqlite.db -t sqlite 
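"Including it in your project settings" means registering the exporter under Scrapy's FEED_EXPORTERS setting so that -t sqlite resolves to it. The class path below is only a guess at the repository's layout, so check that project's README for the actual value:

# settings.py - the exporter class path is an assumption; see the repo's README
FEED_EXPORTERS = {
    'sqlite': 'exporters.SqliteItemExporter',
}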

It can also be adapted for use as an item pipeline instead of an item exporter.

+2
