Splash Lua script to make several clicks and page visits

I am trying to crawl Google Scholar search results and get the full BibTeX entry of each search result. Right now I have a Scrapy crawler with Splash. I have a Lua script that clicks the "Cite" link and loads a modal window before getting the href of the BibTeX format of the citation. But, since there are several search results and, therefore, several "Cite" links, I need to click on all of them and load the individual BibTeX pages.

Here is what I have:

import scrapy
from scrapy_splash import SplashRequest


class CiteSpider(scrapy.Spider):
    """Spider that saves the BibTeX entry of the first Google Scholar result.

    The start URL is re-requested through Splash's ``execute`` endpoint so
    that the Lua ``script`` can click the first "Cite" link, read the href of
    the first ``.gs_citi`` element in the dialog and navigate to it.
    """

    name = "cite"
    allowed_domains = ["scholar.google.com", "scholar.google.ae"]
    start_urls = [
        'https://scholar.google.ae/scholar?q="thermodynamics"&hl=en'
    ]

    # Lua program executed by Splash: load the results page, click the first
    # "Cite" link, wait for the dialog, then follow the first ".gs_citi" link
    # and return that page's HTML, a screenshot and the followed href.
    script = """
        function main(splash)
          local url = splash.args.url
          assert(splash:go(url))
          assert(splash:wait(0.5))
          splash:runjs('document.querySelectorAll("a.gs_nph[aria-controls=gs_cit]")[0].click()')
          splash:wait(3)
          local href = splash:evaljs('document.querySelectorAll(".gs_citi")[0].href')
          assert(splash:go(href))
          return {
            html = splash:html(),
            png = splash:png(),
            href=href,
          }
        end
        """

    def parse(self, response):
        """Request the first start URL again through Splash, running ``script``.

        Yields:
            A single ``SplashRequest`` whose response is handled by
            ``parse_bib``.
        """
        yield SplashRequest(self.start_urls[0], self.parse_bib,
                            endpoint="execute",
                            args={"lua_source": self.script})

    def parse_bib(self, response):
        """Write the text of the page's ``body > pre`` element to a file.

        The file name is the second-to-last ``/``-separated component of
        ``response.url`` with ``.html`` appended. If the page has no
        ``body > pre`` text, a warning is logged and nothing is written.
        """
        bibtex = response.css("body > pre::text").extract_first()
        if bibtex is None:
            # Previously this surfaced as an IndexError from extract()[0].
            self.logger.warning("No BibTeX <pre> block found at %s",
                                response.url)
            return

        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            # The selector yields text, but the file is opened in binary
            # mode, so the text must be encoded before it can be written.
            f.write(bibtex.encode("utf-8"))

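I have tried several things, from looping over the "Cite" links inside the lua script using the result of querySelectorAll, to going back to the results page with javascript history.back() after getting each BibTeX entry, but nothing has worked so far.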

+4
1

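Instead of clicking every link from a single script, generate the Lua script from a function that takes the index of the "Cite" link to click: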

def script(n):
    """Return the Splash Lua source that follows the n-th "Cite" link.

    The generated ``main`` loads ``splash.args.url``, clicks the element at
    index ``n`` among ``a.gs_nph[aria-controls=gs_cit]``, waits, navigates to
    the href of the first ``a.gs_citi`` element and returns a table holding
    the page HTML, a PNG screenshot and that href.

    Args:
        n: Index of the "Cite" link to click; formatted into the script as-is.

    Returns:
        The Lua script as a string.
    """
    # The returned Lua table is substituted in as a value because a literal
    # brace pair inside the template would be read as a format field.
    lua_result = "{html=splash:html(),png=splash:png(), href=href,}"
    template = """
        function main(splash)
          local url = splash.args.url
          local href = ""
          assert(splash:go(url))
          assert(splash:wait(0.5))
          splash:runjs('document.querySelectorAll("a.gs_nph[aria-controls=gs_cit]")[{index}].click()')
          splash:wait(3)
          href = splash:evaljs('document.querySelectorAll("a.gs_citi")[0].href')
          assert(splash:go(href))
          return {result}
        end
        """
    return template.format(index=n, result=lua_result)

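Then, in parse, count the "Cite" links on the results page and yield one request per "Cite" link. Each request runs its own Lua script (generated for that index, as shown above) that clicks the corresponding "Cite" link. Since all of these requests go to the same URL, they would be dropped as duplicates, so pass dont_filter=True: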

def parse(self, response):
    """Yield one Splash request per "Cite" link found in ``response``.

    Every request targets ``response.url`` with a Lua script built by
    ``script`` for that link's index, and is handled by ``self.parse_bib``.
    ``dont_filter=True`` is set on each request because they all share the
    same URL.
    """
    cite_links = response.css("a.gs_nph[aria-controls=gs_cit]").extract()
    for index, _ in enumerate(cite_links):
        lua_args = {"lua_source": script(index)}
        yield SplashRequest(
            response.url,
            self.parse_bib,
            endpoint="execute",
            args=lua_args,
            dont_filter=True,
        )

, .

+1

All Articles