I want to build a scraper in Google Sheets using Google Apps Script. I know this is possible, and I have seen some tutorials and threads about it.
The basic idea is to use:
var html = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model').getContentText();
var doc = XmlService.parse(html);
And then get and work with the elements. However, the XmlService.parse() method does not work on every page. For example, if I try:
function test(){
  var html = UrlFetchApp.fetch("https://www.nespresso.com/br/pt/product/maquina-de-cafe-espresso-pixie-clips-preto-lima-neon-c60-220v").getContentText();
  var parse = XmlService.parse(html);
}
I get the following error:
Error on line 225: The entity name must immediately follow the '&' in the entity reference. (line 3, file "")
I tried using string.replace() to strip the characters that seem to be causing the error, but that does not work either; other errors just appear instead. The following code, for example:
function test(){
  var html = UrlFetchApp.fetch("https://www.nespresso.com/br/pt/product/maquina-de-cafe-espresso-pixie-clips-preto-lima-neon-c60-220v").getContentText();
  var regExp = new RegExp("&", "gi");
  html = html.replace(regExp, "");
  var parse = XmlService.parse(html);
}
gives me the following error:
Error on line 358: The content of elements must consist of well-formed character data or markup. (line 6, file "")
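A more targeted replacement would be to escape bare ampersands instead of deleting them, something like the sketch below, but that presumably cannot help on its own, since the page also contains unclosed tags and other markup that is not well-formed XML:
function testEscaping(){
  var html = UrlFetchApp.fetch("https://www.nespresso.com/br/pt/product/maquina-de-cafe-espresso-pixie-clips-preto-lima-neon-c60-220v").getContentText();
  // Escape only the '&' characters that are not already part of an entity reference.
  html = html.replace(/&(?![a-zA-Z]+;|#[0-9]+;)/g, "&amp;");
  // XmlService.parse() would presumably still fail on unclosed tags, scripts, etc.
  var parse = XmlService.parse(html);
}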
I believe this is a problem with the XmlService.parse() method.
I read in these threads:
Google App Script parsing table from messed html and What is the best way to parse html in google apps script, that you can use a deprecated method called Xml.parse(), which accepts a second parameter that lets it parse HTML. However, as I mentioned, it is deprecated, and I cannot find its documentation anywhere. Xml.parse() does seem to parse the string, but I am having trouble working with the resulting elements because of the missing documentation. It is also not a safe long-term solution, since it may be shut down in the near future.
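From what I could piece together from those threads, the pattern seems to be roughly the sketch below: parse leniently with the deprecated Xml service, serialize the body back to a string, and re-parse that with XmlService. I cannot confirm the doc.html.body navigation or toXmlString() against any documentation, so treat those parts as assumptions:
function parseWithDeprecatedXml(){
  var html = UrlFetchApp.fetch("https://www.nespresso.com/br/pt/product/maquina-de-cafe-espresso-pixie-clips-preto-lima-neon-c60-220v").getContentText();
  // The second argument turns on lenient parsing, which tolerates non-well-formed HTML.
  var xmlDoc = Xml.parse(html, true);
  // Assumption: the old Xml service exposes elements as properties and can
  // serialize them back to a (now well-formed) string.
  var bodyHtml = xmlDoc.html.body.toXmlString();
  // Re-parse the cleaned-up fragment with the documented XmlService.
  var doc = XmlService.parse(bodyHtml);
  Logger.log(doc.getRootElement().getName());
}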
So, how can I parse this HTML in Google Apps Script?
I also tried:
function test(){
  var html = UrlFetchApp.fetch("https://www.nespresso.com/br/pt/product/maquina-de-cafe-espresso-pixie-clips-preto-lima-neon-c60-220v").getContentText();
  var htmlOutput = HtmlService.createHtmlOutput(html).getContent();
  var parse = XmlService.parse(htmlOutput);
}
But this does not work either; I get this error:
Not enough HTML content:
I also thought about using an open-source library to parse the HTML, but I could not find one.
My ultimate goal is to extract certain information from a set of pages, such as "Price", "Link", "Product Name", and so on. I managed to do this with a series of RegExes:
var ss = SpreadsheetApp.getActiveSpreadsheet();
var linksSheet = ss.getSheetByName("Links");
var resultadosSheet = ss.getSheetByName("Resultados");

function scrapyLoco(){
  var links = linksSheet.getRange(1, 1, linksSheet.getLastRow(), 1).getValues();
  var arrayGrandao = [];
  for (var row = 0, len = links.length; row < len; row++){
    var link = links[row];
    var arrayDeResultados = pegarAsCoisas(link[0]);
    Logger.log(arrayDeResultados);
    arrayGrandao.push(arrayDeResultados);
  }
  resultadosSheet.getRange(2, 1, arrayGrandao.length, arrayGrandao[0].length).setValues(arrayGrandao);
}

function pegarAsCoisas(linkDoProduto) {
  var resultadoArray = [];
  var html = UrlFetchApp.fetch(linkDoProduto).getContentText();

  // Product name: grab the <h1> block and parse just that fragment.
  var regExp = new RegExp("<h1([^]*)h1>", "gi");
  var h1Html = regExp.exec(html);
  var h1Parse = XmlService.parse(h1Html[0]);
  var h1Output = h1Parse.getRootElement().getText();
  h1Output = h1Output.replace(/(\r\n|\n|\r|(^( )*))/gm, "");

  // Product code ("Ref.:").
  regExp = new RegExp("Ref.: ([^(])*", "gi");
  var codeHtml = regExp.exec(html);
  var codeOutput = codeHtml[0].replace("Ref.: ", "").replace(" ", "");

  // Description: first sentence of the first <p> inside the description block.
  regExp = new RegExp("margin-top: 5px; margin-bottom: 5px; padding: 5px; background-color: #699D15; color: #fff; text-align: center;([^]*)/div>", "gi");
  var descriptionHtml = regExp.exec(html);
  var regExp = new RegExp("<p([^]*)p>", "gi");
  var descriptionHtml = regExp.exec(descriptionHtml);
  var regExp = new RegExp("^[^.]*", "gi");
  var descriptionHtml = regExp.exec(descriptionHtml);
  var descriptionOutput = descriptionHtml[0].replace("<p>", "");
  descriptionOutput = descriptionOutput + ".";

  // Image URL.
  regExp = new RegExp("ecom(.+?)Main.png", "gi");
  var imageHtml = regExp.exec(html);
  var comecoDaURL = "https://www.nespresso.com/";
  var imageOutput = comecoDaURL + imageHtml[0];

  // Price.
  var regExp = new RegExp("nes_l-float nes_big-price nes_big-price-with-out([^]*)p>", "gi");
  var precoHtml = regExp.exec(html);
  var regExp = new RegExp("[0-9]*,", "gi");
  precoHtml = regExp.exec(precoHtml);
  var precoOutput = "BRL " + precoHtml[0].replace(",", "");

  resultadoArray = [codeOutput, h1Output, descriptionOutput,
    "Home & Garden > Kitchen & Dining > Kitchen Appliances > Coffee Makers & Espresso Machines",
    "Máquina", linkDoProduto, imageOutput, "new", "in stock", precoOutput,
    "", "", "", "Nespresso", codeOutput];
  return resultadoArray;
}
But this takes a lot of programming time, it is hard to adapt when the pages change, and it is not very reliable.
I need a way to parse this HTML and easily access its elements. This is not an add-on, just a simple Google Apps Script.
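To be clear about what I mean by "easily access its elements": on a well-formed fragment, XmlService already gives me exactly that kind of access. The snippet below uses a small made-up fragment just for illustration; the problem is only that real pages are not well-formed XML:
function exemploDesejado(){
  // Made-up, well-formed fragment standing in for a piece of the product page.
  var doc = XmlService.parse('<div><h1>Pixie Clips</h1><p class="price">BRL 449</p></div>');
  var root = doc.getRootElement();
  Logger.log(root.getChild('h1').getText());  // product name
  Logger.log(root.getChild('p').getText());   // price
}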