Parse the html code for the entire webpage scroll down

Question

Parse the html code for the entire webpage scroll down

from bs4 import BeautifulSoup import urllib,sys reload(sys) sys.setdefaultencoding("utf-8") r = urllib.urlopen('https://twitter.com/ndtv').read() soup = BeautifulSoup(r)

This would give me not the entire web page to scroll to the end that I want, but only a few.

EDIT:

 from selenium import webdriver from selenium.common.exceptions import StaleElementReferenceException, TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from bs4 import BeautifulSoup import urllib,sys,requests reload(sys) sys.setdefaultencoding("utf-8") class wait_for_more_than_n_elements_to_be_present(object): def __init__(self, locator, count): self.locator = locator self.count = count def __call__(self, driver): try: elements = EC._find_elements(driver, self.locator) return len(elements) > self.count except StaleElementReferenceException: return False def return_html_code(url): driver = webdriver.Firefox() driver.maximize_window() driver.get(url) # initial wait for the tweets to load wait = WebDriverWait(driver, 10) wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]"))) # scroll down to the last tweet until there is no more tweets loaded while True: tweets = driver.find_elements_by_css_selector("li[data-item-id]") number_of_tweets = len(tweets) print number_of_tweets driver.execute_script("arguments[0].scrollIntoView();", tweets[-1]) try: wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets)) except TimeoutException: break html_full_source=driver.page_source driver.close() return html_full_source url='https://twitter.com/thecoolstacks' #using selenium browser html_source=return_html_code(url) soup_selenium = BeautifulSoup(html_source) print soup_selenium text_tweet=[] alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'}) for tweet in alltweets_selenium: #Text of tweet html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text") text_tweet.append(''.join(html_tweet[0].findAll(text=True))) print text_tweet

Estimated yield:

 import requests from bs4 import BeautifulSoup url='https://twitter.com/thecoolstacks' req = requests.get(url) soup = BeautifulSoup(req.content) alltweets = soup.find_all(attrs={'data-item-type' : 'tweet'}) print alltweets[0]

+1

python selenium web-scraping urllib beautifulsoup

Abhishek bhatia Jun 22 '15 at 14:13

source share

1 answer

alecxe · Accepted Answer · 2015-06-25T18:47:17+0000

I would still insist on using the Twitter API .

Alternatively, here is how you can approach the problem with selenium :

use Explicit Waits and define a custom expected condition to wait for tweets to scroll download
scroll to the last downloaded tweet via scrollIntoView()

Implementation:

 from selenium import webdriver from selenium.common.exceptions import StaleElementReferenceException, TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC class wait_for_more_than_n_elements_to_be_present(object): def __init__(self, locator, count): self.locator = locator self.count = count def __call__(self, driver): try: elements = EC._find_elements(driver, self.locator) return len(elements) > self.count except StaleElementReferenceException: return False url = "https://twitter.com/ndtv" driver = webdriver.Firefox() driver.maximize_window() driver.get(url) # initial wait for the tweets to load wait = WebDriverWait(driver, 10) wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]"))) # scroll down to the last tweet until there is no more tweets loaded while True: tweets = driver.find_elements_by_css_selector("li[data-item-id]") number_of_tweets = len(tweets) driver.execute_script("arguments[0].scrollIntoView();", tweets[-1]) try: wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets)) except TimeoutException: break

This will scroll down as long as it takes to download all existing tweets in this channel.

Here is a snippet of HTML parsing tweets:

 page_source = driver.page_source driver.close() soup = BeautifulSoup(page_source) for tweet in soup.select("div.tweet div.content"): print tweet.p.text

He prints:

 Father Day Facebook post by arrested cop Suhas Gokhale son got nearly 10,000 likes http://goo.gl/aPqlxf pic.twitter.com/JUqmdWNQ3c #HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp … Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK pic.twitter.com/gohX21Gibi ...

Parse the html code for the entire webpage scroll down

More articles: