Using a regex need not be a problem if you take a different approach. For example, find every opening tag, and then check whether its tag name is in a list of specific, valid HTML tag names:
// Feature test: true when the browser wires up per-element DOM prototypes
// (document.body is an HTMLBodyElement and HTMLUnknownElement exists), so an
// unknown tag name can be detected by creating an element from it.
// Guarded so that loading this script before <body> exists, or outside a
// browser, falls back to the static tag list instead of throwing.
var protos = typeof window !== "undefined" &&
        typeof document !== "undefined" &&
        !!document.body &&
        typeof window.HTMLUnknownElement !== "undefined" &&
        document.body.constructor === window.HTMLBodyElement,
    // Fallback whitelist of HTML tag names, used when `protos` is false.
    validHTMLTags = /^(?:a|abbr|acronym|address|applet|area|article|aside|audio|b|base|basefont|bdi|bdo|bgsound|big|blink|blockquote|body|br|button|canvas|caption|center|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hgroup|hr|html|i|iframe|img|input|ins|isindex|kbd|keygen|label|legend|li|link|listing|main|map|mark|marquee|menu|menuitem|meta|meter|nav|nobr|noframes|noscript|object|ol|optgroup|option|output|p|param|plaintext|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|source|spacer|span|strike|strong|style|sub|summary|sup|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video|wbr|xmp)$/i;

/**
 * Strips valid, complete HTML tags from a string and returns the remaining
 * text. Anything that merely looks like a tag (unknown tag name, or a "<name"
 * that is never closed by ">") is kept as literal text.
 *
 * @param {string} txt - Text that may contain HTML markup. null/undefined
 *     yield an empty string; other values are converted with String().
 * @returns {string} The text content of `txt`. In browsers without
 *     `textContent` the element's `innerHTML` is returned instead, which may
 *     still contain markup.
 */
function sanitize(txt) {
    var // A complete quoted attribute value: ="..." or ='...'. Each quoted
        // value is matched on its own, so markup between two separate
        // attributes is never touched.
        quotedValue = /=("[^"]*"|'[^']*')/g,

        // Escape angle brackets inside a quoted value so they cannot be
        // mistaken for tag delimiters by the steps below.
        escapeAngles = function ($0) {
            return $0.replace(/</g, "&lt;").replace(/>/g, "&gt;");
        },

        // Is `tag` a tag name the browser (or the fallback list) knows?
        isValidTag = function (tag) {
            if (!protos) {
                return validHTMLTags.test(tag);
            }
            // \w+ also matches names such as "3" (as in "I <3 you"), for
            // which createElement() throws; HTML tag names start with a letter.
            if (!/^[a-z]/i.test(tag)) {
                return false;
            }
            return !(document.createElement(tag) instanceof HTMLUnknownElement);
        },

        // Replacement callback for every "<name": keep it when it starts a
        // valid, complete tag, otherwise escape the "<" so it stays text.
        replaceInvalid = function ($0, tag, off, whole) {
            // Complete means a ">" appears before the next "<".
            var isComplete = whole.slice(off + 1).search(/^[^<]+>/) > -1;

            return isComplete && isValidTag(tag) ? $0 : "&lt;" + tag;
        },
        doc,
        tmp;

    if (txt == null) {
        return "";
    }

    txt = String(txt)
        .replace(quotedValue, escapeAngles)
        .replace(/<(\w+)/g, replaceInvalid);

    // Parse in a separate, inert document when the browser offers one, so
    // markup such as <img onerror=...> in untrusted input is not activated
    // by the innerHTML assignment. Older browsers fall back to the live
    // document.
    doc = document.implementation && document.implementation.createHTMLDocument
        ? document.implementation.createHTMLDocument("")
        : document;

    tmp = doc.createElement("DIV");
    tmp.innerHTML = txt;

    return "textContent" in tmp ? tmp.textContent : tmp.innerHTML;
}
Working demo: http://jsfiddle.net/m9vZg/3/
This works because browsers parse '>' as text when it is not part of a tag opened by a matching '<'. It does not suffer from the usual problems of parsing HTML tags with a regular expression, because you are looking only for the opening delimiter and the tag name; everything else does not matter.
This is also future-proof: the WebIDL specification tells vendors how to implement prototypes for HTML elements, so we try to create an HTML element from the currently matched tag name. If the element is an instance of HTMLUnknownElement, we know that it is not a valid HTML tag. The validHTMLTags regular expression defines a list of HTML tags for older browsers, such as IE 6 and 7, that do not implement these prototypes.
Andy e
source share