I am scraping an e-commerce site with Selenium because its pages are rendered by JavaScript.
The workflow is: 1. Instantiate the web driver in virtual-display mode while sending a random user agent. Using a random user agent reduces the likelihood of detection by just a bit; it does not reduce the likelihood of IP blocking. 2. For each query term, say pajamas, create a search URL for this website and open it. 3. Get the appropriate text elements via XPath, say, for the top 10 products: their prices, product names, etc. 4. Save them in a file, which I will process later.
I have over 38,000 such URLs, and for each one I need to load the page and extract elements from it. I tried multiprocessing, but quickly realized that this did not work: after a while the site blocked me, so the pages stopped loading.
How can I spoof my IP address in Python, and will it work with Selenium, which manages the network requests itself, rather than with urllib / urlopen?
Apart from the actual extraction via XPaths, here is the base code; in particular, see init_driver.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import argparse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import codecs, urllib, os
import multiprocessing as mp
from my_custom_path import scraping_conf_updated as sf
from fake_useragent import UserAgent
def set_cookies(COOKIES, exp, driver):
    """Add every cookie configured for experiment ``exp`` to ``driver``.

    ``COOKIES[exp]`` is read as a mapping of cookie name to cookie value.
    Each pair is passed to ``driver.add_cookie`` with path ``/``, the
    secure flag off and no expiry. The same ``driver`` object is returned
    so calls can be chained.
    """
    experiment_cookies = COOKIES[exp]
    for cookie_name, cookie_value in experiment_cookies.items():
        payload = {
            'name': cookie_name,
            'value': cookie_value,
            'path': '/',
            'secure': False,
            'expiry': None,
        }
        driver.add_cookie(payload)
    return driver
def check_cookies(driver, exp):
    """Print the name and value of each configured cookie held by ``driver``.

    Only cookies whose name is a key of ``COOKIES[exp]`` are printed; any
    other cookie returned by ``driver.get_cookies()`` is ignored.

    NOTE(review): ``COOKIES`` is read from module scope, not passed in;
    it is not defined in the visible part of this file -- confirm it exists.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("printing cookie name & value")
    expected = COOKIES[exp]
    for cookie in driver.get_cookies():
        # Direct membership test on the mapping; no throwaway key list.
        if cookie['name'] in expected:
            print("%s --> %s" % (cookie['name'], cookie['value']))
def wait_for(driver):
    """Wait up to 10 seconds for the configured marker element to appear.

    The locator comes from ``conf_key``: ``WAIT_FOR_ID`` is used when it is
    non-empty, otherwise ``WAIT_FOR_CLASS`` when that is non-empty. If both
    are empty no wait is performed. ``driver`` is returned in every case.

    Selenium's ``WebDriverWait.until`` raises ``TimeoutException`` if the
    element is still absent when the wait expires.

    NOTE(review): ``conf_key`` is read from module scope and is not defined
    in the visible part of this file.
    """
    if conf_key['WAIT_FOR_ID'] != '':
        locator = (By.ID, conf_key['WAIT_FOR_ID'])
    elif conf_key['WAIT_FOR_CLASS'] != '':
        # Selenium's locator constant is By.CLASS_NAME; By.CLASS does not
        # exist and raised AttributeError whenever this branch was taken.
        locator = (By.CLASS_NAME, conf_key['WAIT_FOR_CLASS'])
    else:
        return driver
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
    return driver
def init_driver(base_url, url, exp):
    """Start Firefox inside a virtual display and load ``url``.

    Steps, in order:
      1. Start a 1024x768 invisible virtual display.
      2. Launch Firefox with a random user agent override.
      3. If cookies are configured for ``exp``, open ``base_url``, clear
         its cookies, install the configured ones and print them.
      4. Open ``url``, close the pop-up if a pop-up XPath is configured,
         and wait for the configured marker element.

    Returns the ready driver. If any step fails, the browser and the
    virtual display are shut down before the exception is re-raised.

    NOTE(review): ``Display``, ``conf_key``, ``COOKIES`` and
    ``identify_and_close_popup`` are not defined or imported in the visible
    part of this file. ``Display`` is presumably
    ``pyvirtualdisplay.Display`` -- confirm the import exists.
    NOTE(review): the guard reads ``conf_key['COOKIES']`` but the call
    passes the module-level ``COOKIES``; presumably the same mapping.
    NOTE(review): on success the display is left running and no reference
    to it is returned, so the caller cannot stop it.
    """
    display = Display(visible=0, size=(1024, 768))
    display.start()
    browser = None  # kept separately so cleanup works even if `driver` is rebound
    try:
        profile = webdriver.FirefoxProfile()
        ua = UserAgent(cache=False)
        profile.set_preference("general.useragent.override", ua.random)
        browser = webdriver.Firefox(profile)
        driver = browser
        # The timeout only affects navigations started after it is set, so
        # it must precede the first get(); it used to follow get(url).
        driver.set_page_load_timeout(300)
        if len(conf_key['COOKIES'][exp]) != 0:
            # Cookies can only be added for the domain currently loaded.
            driver.get(base_url)
            driver.delete_all_cookies()
            driver = set_cookies(COOKIES, exp, driver)
            check_cookies(driver, exp)
        driver.get(url)
        if len(conf_key['POP_UP']['XPATH']) > 0:
            driver = identify_and_close_popup(driver)
        driver = wait_for(driver)
    except Exception:
        # Do not leak a Firefox process or a virtual display on failure.
        if browser is not None:
            browser.quit()
        display.stop()
        raise
    return driver