Get all links in a document

Question

Get all links in a document

Given a “regular document” in Google Docs / Drive (paragraphs, lists, tables) that contains external links scattered throughout the content, how do I compile a list of the links it contains using Google Apps Script?

In particular, I want to update all broken links in the document by searching for oldText in each URL and replacing it with newText — in the URL only, not in the displayed text.

I don’t think the text-replacement section of the developer documentation is what I need — do I have to scan every element of the document? Can I just use editAsText and an HTML regular expression? Examples would be appreciated.

+9

google-apps-script google-docs

drzaus 10 Sep '13 at 19:32

source share

6 answers

I offer a different, shorter answer to your first question, about enumerating all the links in the body of a document. This instructive code returns a flat array of the links in the current document body, where each link is represented by an object with properties referring to the Text element (textEl), the Paragraph or ListItem element that contains it (paragraph), the offset in the text at which the link begins (startOffset), and the URL itself (url). I hope you find it easy to adapt to your own needs.

It uses the getTextAttributeIndices() method and does not iterate over each character of the text, and therefore it is expected to work much faster than the previously written answers.

EDIT: Since publishing this answer, I have modified the function several times. It now also (1) includes the endOffsetInclusive property for each link (note that it can be null for links that extend to the end of the text element — in that case you can use link.textEl.getText().length - 1 instead); (2) finds links in all sections of the document, not just in the body; (3) includes the section and isFirstPageSection properties for each link to indicate where the link is located; and (4) accepts the argument mergeAdjacent, which, when true, returns a single link entry for a continuous stretch of text linked to the same URL (which would otherwise be reported as separate entries if, for example, part of the text is styled differently from the rest).

To include links in all sections, a new iterateSections() utility function was introduced.

 /**
 * Returns a flat array of the links found in every section of the active
 * document (body, headers, footers, ...).
 *
 * Each link is represented by a plain object with the following keys:
 * - "section": {ContainerElement} the document section in which the link is
 *   found.
 * - "isFirstPageSection": {Boolean} whether the given section is a first-page
 *   header/footer section.
 * - "paragraph": {ContainerElement} the Paragraph or ListItem element in
 *   which the link is found.
 * - "textEl": the Text element in which the link is found.
 * - "text": alias of "textEl" (same Text element), so that the key name this
 *   documentation originally advertised also works.
 * - "startOffset": {Number} the offset in the Text element at which the link
 *   begins.
 * - "endOffsetInclusive": the offset of the last character of the link, or
 *   null if the link extends to the end of the Text element.
 * - "url": the URL of the link.
 *
 * @param {boolean} mergeAdjacent Whether consecutive runs that point at the
 *     same URL but carry different attributes (for any reason) should be
 *     returned as a single entry.
 *
 * @returns {Array} the aforementioned flat array of links.
 * @throws {Error} if a section does not expose getParagraphs().
 */
function getAllLinks(mergeAdjacent) {
  var links = [];
  var doc = DocumentApp.getActiveDocument();

  iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
    if (!("getParagraphs" in section)) {
      // as we're using some undocumented API, adding this to avoid cryptic
      // messages upon possible API changes.
      throw new Error("An API change has caused this script to stop " +
                      "working.\n" +
                      "Section #" + sectionIndex + " of type " +
                      section.getType() + " has no .getParagraphs() method. " +
                      "Stopping script.");
    }

    section.getParagraphs().forEach(function(par) {
      // skip empty paragraphs
      if (par.getNumChildren() === 0) {
        return;
      }

      // go over all text elements in paragraph / list-item
      for (var el = par.getChild(0); el != null; el = el.getNextSibling()) {
        if (el.getType() !== DocumentApp.ElementType.TEXT) {
          continue;
        }

        // go over all styling segments in text element; only segment starts
        // are inspected, never individual characters
        var attributeIndices = el.getTextAttributeIndices();
        // tracked per Text element, so links are never merged across elements
        var lastLink = null;

        attributeIndices.forEach(function(startOffset, i, indices) {
          var url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // we hit a link; it ends right before the next segment starts, or
          // runs to the end of the element (null) if this is the last segment
          var endOffsetInclusive =
              (i + 1 < indices.length ? indices[i + 1] - 1 : null);

          // check if this and the last found link are continuous
          if (mergeAdjacent && lastLink != null && lastLink.url === url &&
              lastLink.endOffsetInclusive === startOffset - 1) {
            // this and the previous style segment are continuous
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "text": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };
          links.push(lastLink);
        });
      }
    });
  });

  return links;
}

/**
 * Calls the given function for each section of the document (body, header,
 * etc.). Sections are children of the DocumentElement object.
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
 *     over.
 * @param {Function} func A callback function which will be called, for each
 *     section, with the following arguments (in order):
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - whether the section is a first-page
 *         header/footer section.
 */
function iterateSections(doc, func) {
  // get the DocumentElement interface to iterate over all sections
  // this bit is undocumented API
  var docEl = doc.getBody().getParent();

  var header = doc.getHeader();
  var footer = doc.getFooter();
  var regularHeaderSectionIndex =
      (header == null ? -1 : docEl.getChildIndex(header));
  var regularFooterSectionIndex =
      (footer == null ? -1 : docEl.getChildIndex(footer));

  for (var i = 0; i < docEl.getNumChildren(); ++i) {
    var section = docEl.getChild(i);
    var sectionType = section.getType();

    // a header/footer section that is not the document's regular
    // header/footer is treated as the first-page variant
    var isFirstPageSection = (
      i !== regularHeaderSectionIndex &&
      i !== regularFooterSectionIndex &&
      (sectionType === DocumentApp.ElementType.HEADER_SECTION ||
       sectionType === DocumentApp.ElementType.FOOTER_SECTION));

    func(section, i, isFirstPageSection);
  }
}

+4

Yuval Nov 21 '16 at 21:52

source share

I played around with and built on @Mogsdad's answer — here is a considerably more complicated version:

 var _ = Underscorejs.load(); // loaded via http://googleappsdeveloper.blogspot.com/2012/11/using-open-source-libraries-in-apps.html, rolled my own
var ui = DocumentApp.getUi();

// #region --------------------- Utilities -----------------------------

/**
 * Helpers for finding the links in the active document body and rewriting
 * their URLs. Exposes:
 *  - updateLink(link, oldText, newText, start, end)
 *  - updateLinksMenu()
 *  - findAllElementsFor(el, test)
 */
var gDocsHelper = (function(P, un) {
  // heavily based on answer /questions/802562/get-all-links-in-a-document/2957651#2957651

  // Maximum number of characters of link text shown in previews.
  var PREVIEW_LENGTH = 100;

  /** Escapes the characters that are significant in HTML markup. */
  var escapeHtml = function(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };

  /**
   * Returns a function that lazily builds a short preview of the text of
   * `link` starting at `offset`, so the string is only built when needed.
   */
  var updatedLinkText = function(link, offset) {
    return function() {
      var text = link.getText();
      var from = offset || 0;
      return 'Text: ' + text.substring(from, from + PREVIEW_LENGTH) +
          ((text.length - from) > PREVIEW_LENGTH ? '...' : '');
    };
  };

  /**
   * Returns the offset of the last character, starting at `from`, that still
   * carries the link `url`.
   */
  var findLinkEnd = function(text, from, url) {
    var last = from;
    var length = text.getText().length;
    while (last + 1 < length && text.getLinkUrl(last + 1) === url) {
      last++;
    }
    return last;
  };

  /**
   * Replaces every literal occurrence of `oldText` with `newText` in the URL
   * of the link that starts at offset `start` of the Text element `link`.
   * The displayed text is left untouched.
   *
   * @param {Text} link the Text element that contains the link
   * @param {String} oldText literal text to look for in the URL
   * @param {String} newText literal replacement text
   * @param {Number} start offset of the first linked character (default 0)
   * @param {Number} end offset of the last linked character (inclusive); when
   *     missing or smaller than `start` it is found by scanning the text
   * @returns {Object|Boolean} false when nothing was changed, otherwise
   *     { old, new, getText } describing the change
   */
  P.updateLink = function updateLink(link, oldText, newText, start, end) {
    var from = start || 0;
    var oldLink = link.getLinkUrl(from);

    // nothing to do without a link, without search text, or without a match;
    // an empty search text would otherwise "match" every URL
    if (oldLink == null || !oldText || oldLink.indexOf(oldText) < 0) return false;

    // literal replace-all: the prompt text is not a regular expression, so it
    // must not be interpreted as one (nor the replacement as a $-pattern)
    var newLink = oldLink.split(oldText).join(String(newText));

    // the range is expressed in text offsets, never in URL length
    var to = (end == null || end < from) ? findLinkEnd(link, from, oldLink) : end;

    link.setLinkUrl(from, to, newLink);
    log(true, "Updating Link: ", oldLink, newLink, from, to, updatedLinkText(link, from));

    return { old: oldLink, "new": newLink, getText: updatedLinkText(link, from) };
  };

  // moving this reused block out to 'private' fn
  /**
   * Updates one found link, reports a change in the sidebar, and records the
   * link's original URL in `urls` whether or not it was changed.
   */
  var updateLinkResult = function(text, oldText, newText, link, urls, sidebar) {
    var updateResult = P.updateLink(text, oldText, newText, link.start, link.end);
    if (updateResult !== false) {
      // URLs and document text are escaped before being placed in markup
      sidebar.append('<li>' + escapeHtml(updateResult['old']) + ' &rarr; ' +
          escapeHtml(updateResult['new']) + ' at ' +
          escapeHtml(updateResult['getText']()) + '</li>');
    }
    urls.push(link.url); // so multiple links get added to list
  };

  /**
   * Menu entry point: prompts for the old and new URL text, rewrites every
   * matching link in the document body and lists the changes in a sidebar.
   */
  P.updateLinksMenu = function() {
    // https://developers.google.com/apps-script/reference/base/prompt-response
    var oldText = ui.prompt('Old link text to replace').getResponseText();
    var newText = ui.prompt('New link text to replace with').getResponseText();

    log('Replacing: ' + oldText + ', ' + newText);

    var sidebar = gDocUiHelper.createSidebar('Update All Links',
        '<h3>Replacing</h3><p><code>' + escapeHtml(oldText) +
        '</code> &rarr; <code>' + escapeHtml(newText) + '</code></p><hr /><ol>');

    // current doc available to script
    var doc = DocumentApp.getActiveDocument().getBody();

    var links = P.findAllElementsFor(doc, function(text) {
      var n = text.getText().length;
      var link = false; // the link currently being tracked, or false
      var urls = [];
      var url;

      // scan the text element for links
      for (var i = 0; i < n; i++) {
        url = text.getLinkUrl(i);

        // getLinkUrl keeps returning the URL while INSIDE the link, so just
        // extend the tracked link while the URL stays the same
        if (link !== false && url === link.url) {
          link.end = i;
          continue;
        }

        // just left the link (or moved straight into a different one)
        if (link !== false) {
          updateLinkResult(text, oldText, newText, link, urls, sidebar);
          link = false;
        }

        // a one-character link starts and ends at the same offset
        if (url) {
          link = { start: i, end: i, url: url };
        }
      }

      // the text may end while still inside a link
      if (link !== false) updateLinkResult(text, oldText, newText, link, urls, sidebar);

      return urls;
    });

    sidebar.append('</ol><p><strong>' + links.length + ' links reviewed</strong></p>');
    gDocUiHelper.attachSidebar(sidebar);

    log(links);
  };

  /**
   * Runs `test` on every Text element found under `el` and collects what it
   * returns. Array results are flattened into the output; other truthy
   * results are appended as-is.
   *
   * The previous version also recursed into the children of `el`, but it did
   * so without passing `test`, so those calls could never add a result; that
   * recursion has been removed.
   *
   * @param {Element} el element to search (must provide findElement)
   * @param {Function} test callback invoked with each Text element
   * @returns {Array} the collected results
   */
  P.findAllElementsFor = function(el, test) {
    var results = [];
    var searchResult = null;
    var result;

    // https://developers.google.com/apps-script/reference/document/body#findElement(ElementType)
    while ((searchResult = el.findElement(DocumentApp.ElementType.TEXT, searchResult))) {
      var t = searchResult.getElement().editAsText();

      if (test && (result = test(t))) {
        if (_.isArray(result)) results = results.concat(result);
        else results.push(result);
      }
    }

    return results;
  };

  return P;
})({});

// really? it can't handle object properties?
function gDocsUpdateLinksMenu() {
  gDocsHelper.updateLinksMenu();
}

gDocUiHelper.addMenu('Zaus', [ ['Update links', 'gDocsUpdateLinksMenu'] ]);

// #endregion --------------------- Utilities -----------------------------

And I include the “extra” utility classes for creating menus, sidebars, etc. below for completeness:

 /**
 * Logs its arguments through Logger.log. Arguments that are functions are
 * called and their return value is logged instead, so an expensive message
 * is only built when it is actually logged.
 *
 * When the first argument is exactly `true` it acts as a mode flag: the
 * remaining arguments are joined with '; ' into a single log entry (the flag
 * itself is not printed). Otherwise every argument gets its own log entry.
 */
var log = function() {
  var args = Array.prototype.slice.call(arguments);
  var resolve = function(v) { return _.isFunction(v) ? v() : v; };

  if (args[0] === true) {
    Logger.log(_.map(args.slice(1), resolve).join('; '));
  } else {
    _.each(args, function(v) { Logger.log(resolve(v)); });
  }
};

// #region --------------------- Menu -----------------------------

/**
 * Helpers for building custom menus and sidebars in the document UI.
 * Menus are registered with addMenu() and created when onOpen() runs.
 */
var gDocUiHelper = (function(P, un) {

  /**
   * Creates one menu and adds it to the UI.
   *
   * @param {Object} spreadsheet unused; kept so existing callers keep working
   * @param {String} title menu title
   * @param {Array} items menu items, each either [name, functionName] or
   *     { name: ..., functionName: ... }; malformed items are reported via
   *     log() and an alert, and skipped
   */
  P.addMenuToSheet = function addMenu(spreadsheet, title, items) {
    // fetched here rather than read from a global defined in another file
    var docUi = DocumentApp.getUi();
    var menu = docUi.createMenu(title);

    // make sure menu items are correct format
    _.each(items, function(v, k) {
      var err = [];

      // provided in format [ [name, fn],... ] instead
      if (_.isArray(v)) {
        if (v.length === 2) {
          menu.addItem(v[0], v[1]);
        } else {
          err.push('Menu item ' + k + ' missing name or function: ' + v.join(';'));
        }
      } else {
        if (!v.name) err.push('Menu item ' + k + ' lacks name');
        if (!v.functionName) err.push('Menu item ' + k + ' lacks function');

        if (!err.length) menu.addItem(v.name, v.functionName);
      }

      if (err.length) {
        log(err);
        docUi.alert(err.join('; '));
      }
    });

    menu.addToUi();
  };

  // menus registered so far: { menuTitle: [menuItem, ...] }
  var initializers = {};

  /**
   * Registers menu items under `menuTitle`; repeated calls with the same
   * title append to the same menu. Nothing is shown until onOpen() runs.
   */
  P.addMenu = function(menuTitle, menuItems) {
    if (initializers[menuTitle] === un) {
      initializers[menuTitle] = [];
    }
    initializers[menuTitle] = initializers[menuTitle].concat(menuItems);
  };

  /**
   * Builds a sidebar.
   *
   * @param {String} title sidebar title
   * @param {String} content initial HTML content
   * @param {Object} options optional; `width` in pixels (default 350) and
   *     `on` (truthy to show the sidebar immediately)
   * @returns {HtmlOutput} the sidebar, so more content can be appended
   */
  P.createSidebar = function(title, content, options) {
    // previously read an undefined `width` identifier and threw a
    // ReferenceError whenever options.width was supplied
    var width = (options && options.width) ? options.width : 350 /* pixels */;

    var sidebar = HtmlService
      .createHtmlOutput()
      .setTitle(title)
      .setWidth(width);

    sidebar.append(content);

    if (options && options.on) DocumentApp.getUi().showSidebar(sidebar);

    return sidebar;
  };

  /** Shows a sidebar built with createSidebar(). */
  P.attachSidebar = function(sidebar) {
    DocumentApp.getUi().showSidebar(sidebar);
  };

  /** Creates every menu registered through addMenu(). */
  P.onOpen = function() {
    // The menus are added to the document UI, so the active document is
    // passed along instead of asking SpreadsheetApp for a spreadsheet;
    // addMenuToSheet does not use this argument.
    var doc = DocumentApp.getActiveDocument();
    log(initializers);
    _.each(initializers, function(v, k) { P.addMenuToSheet(doc, k, v); });
  };

  return P;
})({});

// #endregion --------------------- Menu -----------------------------

/**
 * A special function that runs when the document is opened, used to add the
 * registered custom menus to the UI.
 */
function onOpen() {
  gDocUiHelper.onOpen();
}

+1

drzaus Sep 13 '13 at 21:11

source share

You are right ... search-and-replace is not applicable here. Use setLinkUrl(): https://developers.google.com/apps-script/reference/document/container-element#setLinkUrl(String)

Basically you need to iterate recursively through the elements (elements can contain elements) and, for each one, call getLinkUrl() to get the old URL; if it is not null, call setLinkUrl() with the new URL. This leaves the displayed text unchanged.

0

Davidf 10 Sep '13 at 21:53

source share

There are a few problems with Mogsdad's solution. In particular, it misses links that run to the end of their parent element, because there is no trailing non-link character to close them. I implemented something that addresses this and returns a standard Range object. Sharing it here in case anyone finds it helpful.

 /**
 * Builds a Range covering every linked run of text under `element`.
 *
 * Text elements are scanned character by character; any other element that
 * has children is searched recursively, and elements without children are
 * skipped. A link that runs to the very end of a Text element is included.
 *
 * @param {Element} element The element to search. Defaults to the body of
 *     the active document, matching the other getAllLinks helper.
 * @returns {Range} a Range with one partial-text entry per link, from the
 *     first to the last linked character (inclusive).
 */
function getAllLinks(element) {
  element = element || DocumentApp.getActiveDocument().getBody();
  var rangeBuilder = DocumentApp.getActiveDocument().newRange();

  if (element.getType() === DocumentApp.ElementType.TEXT) {
    // Parse the text iteratively to find the start and end indices for each link
    var links = [];
    var length = element.getText().length;
    var currentLink = null; // The latest link being built

    for (var charIndex = 0; charIndex < length; charIndex++) {
      var currentUrl = element.getLinkUrl(charIndex);

      // Same URL as the previous character: extend the end of the current link
      if (currentLink !== null && currentUrl === currentLink.url) {
        currentLink.endOffsetInclusive = charIndex;
        continue;
      }

      // The URL changed (or the link ended): close the link being built
      if (currentLink !== null) links.push(currentLink);

      // A new URL starts a new link; no URL (null/undefined) starts nothing
      currentLink = (currentUrl == null) ? null : {
        url: String(currentUrl),
        startOffset: charIndex,
        endOffsetInclusive: charIndex
      };
    }

    // Handle the end case when the final character is part of a link
    if (currentLink !== null) links.push(currentLink);

    // Convert the links into range entries before returning
    links.forEach(function(link) {
      rangeBuilder.addElement(element, link.startOffset, link.endOffsetInclusive);
    });
  }
  // If not a text element then recursively get links from child elements
  else if (element.getNumChildren) {
    for (var i = 0; i < element.getNumChildren(); i++) {
      rangeBuilder.addRange(getAllLinks(element.getChild(i)));
    }
  }

  return rangeBuilder.build();
}

0

fynyky Aug 05 '17 at 18:56

source share

I solved this problem with this Excel macro, which lists links from a Word document. First you will need to copy your data into a Word document.

 Sub getLinks()
    ' Writes the address of every hyperlink in the Word document at filePath
    ' into consecutive cells going down from A1.
    ' Word is started in the background and is always shut down again, even
    ' when opening or reading the document fails.
    Const filePath As String = "C:\test\test.docx"

    Dim wApp As Word.Application
    Dim wDoc As Word.Document
    Dim i As Long               ' Long: the hyperlink count may exceed the Integer range
    Dim r As Range

    On Error GoTo Fail

    Set wApp = CreateObject("Word.Application")
    'wApp.Visible = True
    ' The document is only read, so open it read-only
    Set wDoc = wApp.Documents.Open(filePath, ReadOnly:=True)

    Set r = Range("A1")
    For i = 1 To wDoc.Hyperlinks.Count
        r.Value = wDoc.Hyperlinks(i).Address
        Set r = r.Offset(1, 0)
    Next i

Done:
    ' Best-effort cleanup: close the document without saving, then quit Word,
    ' so no hidden Word instance is left running
    On Error Resume Next
    If Not wDoc Is Nothing Then wDoc.Close SaveChanges:=False
    If Not wApp Is Nothing Then wApp.Quit
    Set wDoc = Nothing
    Set wApp = Nothing
    On Error GoTo 0
    Exit Sub

Fail:
    MsgBox "getLinks failed: " & Err.Description, vbExclamation
    Resume Done
End Sub

0

Tony m Apr 4 '19 at 12:40

source share

Mogsdad · Accepted Answer · 2013-09-11T02:15:17+0000

This is only mostly painful! The code is also available as a gist.

Screenshot (yes, I can’t spell.)

getAllLinks

This is a utility function that scans the document for all LinkUrls, returning them in an array.

/**
 * Get an array of all LinkUrls in the document. The function is
 * recursive, and if no element is provided, it will default to
 * the active document Body element.
 *
 * Every link is reported, including links that run to the end of their Text
 * element and links that are a single character long. Two directly adjacent
 * links with different URLs are reported as two entries. Elements that are
 * neither Text nor containers (no getNumChildren method) are skipped.
 *
 * @param {Element} element The document element to operate on.
 *
 * @returns {Array} Array of objects, vis
 *              {element,
 *               startOffset,
 *               endOffsetInclusive,
 *               url}
 */
function getAllLinks(element) {
  var links = [];
  element = element || DocumentApp.getActiveDocument().getBody();

  if (element.getType() === DocumentApp.ElementType.TEXT) {
    var textObj = element.editAsText();
    var text = element.getText();
    var curUrl = null; // the link being built, or null while outside a link

    for (var ch = 0; ch < text.length; ch++) {
      var url = textObj.getLinkUrl(ch);

      // still inside the same link: just move its end forward
      if (curUrl !== null && url === curUrl.url) {
        curUrl.endOffsetInclusive = ch;
        continue;
      }

      // left the link, or stepped straight into a different one
      if (curUrl !== null) {
        links.push(curUrl);
      }

      // start a new link; the end offset starts at the first character so
      // that one-character links are complete too
      curUrl = (url == null) ? null : {
        element: element,
        url: String(url), // grab a copy
        startOffset: ch,
        endOffsetInclusive: ch
      };
    }

    // the text ended while still inside a link
    if (curUrl !== null) {
      links.push(curUrl);
    }
  }
  else if (typeof element.getNumChildren === 'function') {
    var numChildren = element.getNumChildren();
    for (var i = 0; i < numChildren; i++) {
      links = links.concat(getAllLinks(element.getChild(i)));
    }
  }

  return links;
}

findAndReplaceLinks

This utility is built on getAllLinks to perform a search and replace function.

 /**
 * Replace all or part of UrlLinks in the document.
 *
 * The pattern is always treated as a regular expression: a string is
 * compiled with `new RegExp(searchPattern)` (so only the first match in each
 * URL is replaced), while a RegExp object is used as given (pass the `g`
 * flag to replace every match). An empty or missing pattern changes nothing.
 *
 * @param {String|RegExp} searchPattern the regex pattern to search for
 * @param {String} replacement the text to use as replacement
 *
 * @returns {Number} number of Urls changed
 */
function findAndReplaceLinks(searchPattern, replacement) {
  if (searchPattern == null || searchPattern === '') {
    return 0;
  }

  // Compile once so that matching and replacing agree on what the pattern
  // means; String.replace would otherwise treat a string as literal text.
  var pattern = (searchPattern instanceof RegExp)
      ? searchPattern
      : new RegExp(searchPattern);

  var links = getAllLinks();
  var numChanged = 0;

  for (var l = 0; l < links.length; l++) {
    var link = links[l];
    var newUrl = link.url.replace(pattern, replacement);

    // Only touch (and count) links whose URL actually changes
    if (newUrl === link.url) {
      continue;
    }

    // A link entry without an end offset covers a single character
    var endOffsetInclusive = (link.endOffsetInclusive == null)
        ? link.startOffset
        : link.endOffsetInclusive;

    link.element.setLinkUrl(link.startOffset, endOffsetInclusive, newUrl);
    numChanged++;
  }

  return numChanged;
}

Demo interface

To demonstrate the use of these utilities, here are a few user interface extensions:

 /**
 * Adds a "Utils" menu with two items to the document UI.
 */
function onOpen() {
  DocumentApp.getUi().createMenu('Utils')
      .addItem('List Links', 'sidebarLinks')
      .addItem('Replace Link Text', 'searchReplaceLinks')
      .addToUi();
}

/**
 * Asks for the text to find and the replacement text, then runs the
 * replacement over all link URLs and reports the result.
 *
 * The dialog is built with two Ui.prompt() calls instead of a UiApp form.
 * NOTE(review): UiApp was deprecated by Google in December 2014 — confirm
 * whether it is still available in the runtime this script targets.
 * Cancelling either prompt aborts without changing anything.
 */
function searchReplaceLinks() {
  var ui = DocumentApp.getUi();

  var find = ui.prompt('Change Url text', 'Find: ', ui.ButtonSet.OK_CANCEL);
  if (find.getSelectedButton() !== ui.Button.OK) {
    return;
  }

  var replace = ui.prompt('Change Url text', 'Replace: ', ui.ButtonSet.OK_CANCEL);
  if (replace.getSelectedButton() !== ui.Button.OK) {
    return;
  }

  // Reuse doPost so both entry points share the same behaviour
  doPost({
    parameter: {
      searchPattern: find.getResponseText(),
      replacement: replace.getResponseText()
    }
  });
}

/**
 * ClickHandler to close dialog.
 * Retained only for UiApp dialogs created by older versions of this script;
 * searchReplaceLinks() no longer registers it.
 */
function myClickHandler(e) {
  var app = UiApp.getActiveApplication();

  app.close();
  return app;
}

/**
 * Runs the search and replace described by `e.parameter.searchPattern` and
 * `e.parameter.replacement`, refreshes the link sidebar and shows how many
 * URLs were changed.
 *
 * @param {Object} e object with a `parameter` property holding
 *     `searchPattern` and `replacement`
 */
function doPost(e) {
  var numChanged = findAndReplaceLinks(e.parameter.searchPattern, e.parameter.replacement);
  var ui = DocumentApp.getUi();

  sidebarLinks(); // Update list

  ui.alert('Results',
           "Changed " + numChanged + " urls.",
           ui.ButtonSet.OK);
}

/**
 * Shows a custom HTML user interface in a sidebar in the Google Docs editor.
 * The sidebar lists the URL of every link in the document.
 */
function sidebarLinks() {
  // URLs come from the document, so escape them before placing them in markup
  var escapeHtml = function(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };

  var links = getAllLinks();
  var sidebar = HtmlService
          .createHtmlOutput()
          .setTitle('URL Links')
          .setWidth(350 /* pixels */);

  // Display list of links, url only.
  for (var l = 0; l < links.length; l++) {
    sidebar.append('<p>' + escapeHtml(links[l].url) + '</p>');
  }

  DocumentApp.getUi().showSidebar(sidebar);
}

Get all links in a document

getAllLinks

findAndReplaceLinks

Demo interface

More articles: