Selenium to navigate forms and wait for the file to complete the download

I know that this question has come up a billion times, and what I want to do is not Selen’s goal, but I don’t know anything else that can achieve this goal. I read these answers to the best of my ability and a lot of documentation, but I could use some pointers.

I am trying to download several files from CDC Compressed Mortality , which requires one to 1) click "I agree", 2) Go through a bunch of menus, check boxes and drop-down lists and 3) click "Submit" and wait for the file to start downloading automatically.

There are some very difficult web page restrictions that led me to find ways to automate this.

  • Exporting the resulting data set using the send button is incompatible with some settings that omit data points, namely, in some cases, the created file does not reflect the settings for suppressed / missing values
  • Page limits the number of rows of data

I found that when exporting data by individual states, the two points above are no longer a problem, but it is super-intensive and not very funny. I should note that I have no experience with Python (or with real programming), but the documentation seemed good enough for me to work a bit. This is what I would like to do:

  • Go to the page, click "I accept"
  • Choose state
  • Fill in some parameters
  • Click send
  • Wait for the file to finish downloading

Firefox , . , , , .part .

, 12 Florida, . Firefox , . , .

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os, unittest, time, re

basedir = os.getcwd()
savedir = os.path.join(basedir, 'download')

# Check download status
def checkdownload():
    os.chdir(savedir)
    files = filter(os.path.isfile, os.listdir(os.getcwd()))
    files = [os.path.join(os.getcwd(), f) for f in files] # add path to each file
    files.sort(key=lambda x: os.path.getmtime(x))
    if not files :
        newest_file = "no"
    else :
        newest_file = files[-1]
    os.chdir(basedir)
    return newest_file



# Set user profile
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList",2)
fp.set_preference("browser.download.manager.showWhenStarting",False)
fp.set_preference("browser.download.dir",basedir+'\\download')
fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/plain")

# Before anything downloads
previousnew = checkdownload()

# Create a new instance of the Firefox driver
b = webdriver.Firefox(firefox_profile=fp)
b.get("http://wonder.cdc.gov/cmf-icd9.html")
b.implicitly_wait(1)

### Find states
b.find_element_by_xpath("/html/body/div/form/div[3]/div/center/input").click() # Press 'I agree'

# print [o.text for o in Select(b.find_element_by_id("SD16.V1")).options]

# Make a list of all the states available
options = Select(b.find_element_by_id("codes-D16.V9")).options
optionsList = []

for option in options: 
    optionsList.append(option.get_attribute("value"))
    if option.get_attribute("value") == "*All*":
        optionsList.remove(option.get_attribute("value")) # Remove the *All* option


# Loop over states individually
for optionValue in optionsList:
    print "\nRunning on %s" % optionValue

    b.get("http://wonder.cdc.gov/cmf-icd9.html")
    b.implicitly_wait(1)

    b.find_element_by_xpath("/html/body/div/form/div[3]/div/center/input").click() # Press 'I agree'

    print "Add Selections"

    # 1. Table layout, id = SB_1 ... SB_5
    Select(b.find_element_by_id("SB_1")).select_by_visible_text("Age Group")
    Select(b.find_element_by_id("SB_2")).select_by_visible_text("Race")
    Select(b.find_element_by_id("SB_3")).select_by_visible_text("Gender")
    Select(b.find_element_by_id("SB_4")).select_by_visible_text("County")
    Select(b.find_element_by_id("SB_5")).select_by_visible_text("Year")

    # 2. Location, id = codes-D16.V9
    Select(b.find_element_by_id("codes-D16.V9")).deselect_by_index(0) # remove *All* option
    Select(b.find_element_by_id("codes-D16.V9")).select_by_value(optionValue) # selection

    # Age Group, id = SD16.V5
    Select(b.find_element_by_id("SD16.V5")).deselect_by_index(0) # remove *All* option
    Select(b.find_element_by_id("SD16.V5")).select_by_value('20-24')
    Select(b.find_element_by_id("SD16.V5")).select_by_value('25-34')
    Select(b.find_element_by_id("SD16.V5")).select_by_value('35-44')
    Select(b.find_element_by_id("SD16.V5")).select_by_value('45-54')
    Select(b.find_element_by_id("SD16.V5")).select_by_value('55-64')

    # Gender, id = SD16.V7
    # Race, id = SD16.V8
    # Hisp, Does not exist in this file

    # Year, id = SD16.V1
    yr = 1997, 1998
    Select(b.find_element_by_id("SD16.V1")).deselect_by_index(0) # remove *All* option
    select = Select(b.find_element_by_id("SD16.V1"))
    for o in yr:
        select.select_by_value("%s" % o)

    # ICD-9 Codes, id = codes-D16.V2
    # Rate per, id = SO_rate_per

    # Other options
    b.find_element_by_id("export-option").click()
    b.find_element_by_id("CO_show_totals").click()
    b.find_element_by_id("CO_show_zeros").click()
    b.find_element_by_id("CO_show_suppressed").click()

    # Submit
    print "Submit"
    b.find_element_by_xpath("/html/body/div/form/table/tbody/tr/td/div[2]/div[2]/center/input[1]").click()

    # Check if file has begun downloading
    print "Waiting for new file"
    new = checkdownload()
    while previousnew == new:
        print "... waiting"
        new = checkdownload()
        continue

    print "Waiting for download to finish"
    # New file found, wait until it doesn't have .part extension
    new = checkdownload()
    while os.path.splitext(new)[1] == ".part":
        print "... downloading"
        new = checkdownload()
        continue

    print "Downloaded"

    continue


b.quit()

, , . , ?

PS. , , " ". , script .

+4
1

. - '\\ ' , , Windows.

, - , . :

os.chdir(savedir)
files = filter(os.path.isfile, os.listdir(os.getcwd()))
files = [os.path.join(os.getcwd(), f) for f in files] # add path to each file
files.sort(key=lambda x: os.path.getmtime(x))

, , , "" ( .part, ). listdir getmtime, getmtime , , script Firefox ( "" ). , , , .

, , try/catch, , , . , , .

:

def checkdownload():
    max_mtime = 0
    newest_file = ""
    for filename in filter(os.path.isfile, os.listdir(savedir)):
        path = os.path.join(savedir, filename)
        try:
            mtime = os.path.getmtime(path)
            if mtime > max_mtime:
                newest_file = path
                max_mtime = mtime
        except OSError:
            pass  # File probably just moved/deleted
    return newest_file
  • chdir , , , , , .

  • ,

0

All Articles