Headless endless scroll of selenium

Question

Headless endless scroll of selenium

import pdb
import urllib

import lxml
import requests
import unidecode
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from xvfbwrapper import Xvfb


class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition: true once more than ``count`` elements match ``locator``.

    Intended to be passed to ``WebDriverWait.until``; ``locator`` is a
    ``(By, value)`` tuple.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Public API instead of the private EC._find_elements helper.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # The DOM changed mid-lookup; report "not yet" so the wait retries.
            return False


def return_html_code(url):
    """Load ``url`` in Firefox under Xvfb, scroll until no more tweets load,
    and return the full page source.

    Raises ``TimeoutException`` if no tweet becomes visible within the
    initial wait. The browser and the virtual display are always shut down.
    """
    print(url)  # added in edit 1
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.maximize_window()
            driver.get(url)

            # initial wait for the tweets to load
            wait = WebDriverWait(driver, 240)
            wait.until(EC.visibility_of_element_located(tweet_locator))

            # scroll down to the last tweet until there is no more tweets loaded
            while True:
                tweets = driver.find_elements(*tweet_locator)
                if not tweets:
                    break
                # The wait below needs the count seen *before* scrolling.
                number_of_tweets = len(tweets)
                print(number_of_tweets)  # added in edit 1
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])

                try:
                    wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
                except TimeoutException:
                    # No new tweets appeared within the timeout: assume the end.
                    break

            return driver.page_source
        finally:
            # quit() ends the whole browser session even when a step above raised.
            driver.quit()
    finally:
        vdisplay.stop()


# NOTE(review): `url` is not defined in this snippet; presumably the calling
# script builds it before this line runs.
html_full = return_html_code(url)

Output:

 https://twitter.com/search?q=Error%20Check&src=typd&lang=en 20 39 56 74

I have the code above for endlessly scrolling a page in headless mode, but somehow it seems to stop too early. Reference: Stack Overflow.

Edit 1:

 $ phantomjs --version 2.1.1

When running @alecxe's code, two runs gave different results; checking the date of the oldest tweet makes it clear that there are more tweets to load:

 https://twitter.com/search?q=Error%20Check&src=typd&lang=en 20 40 59 76 95 114 133 152 171 191 211 231 249 267 Date of most old tweet: 12 Jan 2016 https://twitter.com/search?q=Error%20Check&src=typd&lang=en 20 40 59 76 95 114 133 152 171 191 211 231 249 267 287 303 317 337 356 373 388 400 418 437 457 476 492 Date of most old tweet: 8 Jan 2016

Edit2:

When running the updated version of @alecxe's code, it showed the error below after ~7000 tweets.

  Traceback (most recent call last): File "twitter_script.py", line 82, in <module> search_twitter('Alcoholics Anonymous') File "twitter_script.py", line 76, in search_twitter db_name=write_data_to_db(*get_twitter_data(query)) File "twitter_script.py", line 24, in get_twitter_data html_full=return_html_code(url) File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code html_full_source=driver.page_source File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source return self.execute(Command.GET_PAGE_SOURCE)['value'] File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute response = self.command_executor.execute(driver_command, params) File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute return self._request(command_info[0], url, body=data) File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request resp = opener.open(request, timeout=self._timeout) File "c:\Anaconda\lib\urllib2.py", line 431, in open response = self._open(req, data) File "c:\Anaconda\lib\urllib2.py", line 449, in _open '_open', req) File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain result = func(*args) File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open return self.do_open(httplib.HTTPConnection, req) File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open r = h.getresponse(buffering=True) File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse response.begin() File "c:\Anaconda\lib\httplib.py", line 453, in begin version, status, reason = self._read_status() File "c:\Anaconda\lib\httplib.py", line 409, in _read_status line = self.fp.readline(_MAXLINE + 1) File "c:\Anaconda\lib\socket.py", line 480, in readline data = self._sock.recv(self._rbufsize) socket.error: [Errno 10054] An existing connection was forcibly closed by 
the remote host

Edit 3: Trying the same code for a different URL.

 https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en Traceback (most recent call last): File "twitter_script.py", line 64, in <module> search_twitter('Alcoholics Anonymous Drunk') File "twitter_script.py", line 58, in search_twitter db_name=write_data_to_db(*get_twitter_data(query)) File "twitter_script.py", line 31, in get_twitter_data html_full=return_html_code(url) File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]"))) File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until raise TimeoutException(message, screen, stacktrace) selenium.common.exceptions.TimeoutException: Message: Screenshot: available via screen

Edit 4:

 ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt Traceback (most recent call last): File "twitter_script.py", line 70, in <module> search_twitter('alcoholics anonymous') File "twitter_script.py", line 64, in search_twitter db_name=write_data_to_db(*get_twitter_data(query)) File "twitter_script.py", line 37, in get_twitter_data html_full=return_html_code(url) File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code driver=webdriver.Firefox(firefox_profile=profile) File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__ self.binary, timeout), File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__ self.binary.launch_browser(self.profile) File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser self._wait_until_connectable() File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable % (self.profile.path)) selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.

I get the above error after a while.

+8

python selenium selenium-webdriver xvfb

Abhishek bhatia Jan 22 '16 at 8:26

source share

1 answer

alecxe · Accepted Answer · 2016-01-25T13:33:27+0000

Here is a set of things that made it work for me in headless mode:

switch to PhantomJS
pretend to be a different browser by setting the User-Agent string
Before scrolling to the beginning of the last tweet, scroll to the top of the page (several times to increase reliability).

The code:

 import time


 def return_html_code(url):
     """Scroll a Twitter search page in headless PhantomJS and return its HTML.

     Relies on names from the question's snippet being in scope: ``webdriver``,
     ``WebDriverWait``, ``EC``, ``By``, ``TimeoutException`` and
     ``wait_for_more_than_n_elements_to_be_present``.

     Raises ``TimeoutException`` if no tweet becomes visible within the
     initial wait. The PhantomJS process is always shut down.
     """
     # Pretend to be a desktop Chrome browser by overriding the User-Agent.
     dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
     dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
     tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

     driver = webdriver.PhantomJS(desired_capabilities=dcap)
     try:
         driver.maximize_window()
         driver.get(url)

         # initial wait for the tweets to load
         wait = WebDriverWait(driver, 30)
         wait.until(EC.visibility_of_element_located(tweet_locator))

         # scroll down to the last tweet until there is no more tweets loaded
         while True:
             tweets = driver.find_elements(*tweet_locator)
             if not tweets:
                 break
             number_of_tweets = len(tweets)
             print(number_of_tweets)

             # move to the top and then to the bottom 5 times in a row
             for _ in range(5):
                 driver.execute_script("window.scrollTo(0, 0)")
                 driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                 time.sleep(0.5)

             try:
                 wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
             except TimeoutException:
                 # No new tweets appeared within the timeout: assume the end.
                 break

         # The caller does `html_full = return_html_code(url)`, so hand back the HTML.
         return driver.page_source
     finally:
         # Shut PhantomJS down even when a step above raised.
         driver.quit()

Headless endless scroll of selenium

More articles: