I'm trying to open websites in new tabs inside a single WebDriver instance. I want to do this because opening a new WebDriver for each website takes about 3.5 seconds with PhantomJS, and I need it to be faster.
I am using a multiprocessing Python script, and I want to get some elements from each page, so the workflow looks like this:
Open the browser -> loop through my array -> for each element in the array: open the website in a new tab -> do my business -> close the tab
But I can’t find a way to achieve this.
Here is the code I'm using. It takes forever between websites, and I need it to be fast. Other tools are allowed, but I don't know many tools for scraping the content of a website that is loaded with JavaScript (divs generated when an event fires on load, etc.). That is why I need Selenium; BeautifulSoup cannot be used for some of my pages.
#!/usr/bin/env python
import cStringIO
import hashlib
import itertools
import json
import logging
import multiprocessing
import os
import re
import shutil
import socket
import sys
import time
import traceback
import urllib
import urllib2
from os import listdir
from os.path import isfile, join
from pprint import pprint

import mysql.connector
import pika
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def getPhantomData(parameters):
    """Load a page in a fresh browser and collect one attribute per match.

    Args:
        parameters: mapping with the keys 'target_url' (page to open),
            'selector' (CSS selector for the elements of interest) and
            'attribute' (name of the attribute to read from each element).

    Returns:
        A JSON string of the form '{"data": [...]}' on success, or None
        when anything goes wrong (the error is printed).
    """
    # Bind the name before the try block so the cleanup below never hits an
    # unbound variable when the browser itself fails to start.
    browser = None
    try:
        # We create the WebDriver
        browser = webdriver.Firefox()
        # Navigate to the URL
        browser.get(parameters['target_url'])
        # Find all elements matching the selector
        links = browser.find_elements_by_css_selector(parameters['selector'])
        # Extract the requested attribute from every matched element
        result = [
            link.get_attribute(parameters['attribute']) for link in links
        ]
        return json.dumps({'data': result})
    except Exception as err:
        print(err)
        return None
    finally:
        # quit() ends the whole browser session; a separate close() before
        # it is redundant.
        if browser is not None:
            browser.quit()


def callback(ch, method, properties, body):
    """Handle one queue message: scrape the page it describes, then ack.

    The message body is JSON holding the parameters for getPhantomData.
    The message is acknowledged when the scrape produced a non-empty
    result; otherwise it is rejected and put back on the queue.
    """
    parameters = json.loads(body)
    message = getPhantomData(parameters)
    # getPhantomData hands back a JSON *string* (or None on failure), so it
    # has to be decoded before the payload can be inspected.
    data = json.loads(message)['data'] if message else None
    if data:
        ch.basic_ack(delivery_tag=method.delivery_tag)
    else:
        # NOTE(review): a message that always fails or always yields an
        # empty result is requeued indefinitely -- confirm this is intended.
        ch.basic_reject(delivery_tag=method.delivery_tag, requeue=True)


def consume():
    """Connect to RabbitMQ and process 'com.stuff.images' until interrupted."""
    credentials = pika.PlainCredentials('invitado', 'invitado')
    rabbit = pika.ConnectionParameters('localhost', 5672, '/', credentials)
    connection = pika.BlockingConnection(rabbit)
    channel = connection.channel()

    # Connect to the channel
    channel.queue_declare(queue='com.stuff.images', durable=True)
    channel.basic_consume(callback, queue='com.stuff.images')

    print(' [*] Waiting for messages. To exit press CTRL^C')
    try:
        channel.start_consuming()
    except KeyboardInterrupt:
        pass


workers = 5
pool = multiprocessing.Pool(processes=workers)
for i in range(workers):
    pool.apply_async(consume)

try:
    # Sleep instead of spinning so the parent process does not burn a full
    # CPU core while the workers run.
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print(' [*] Exiting...')
    pool.terminate()
    pool.join()