Since this text contains only image tags, it may be OK to use a regex. But for anything else, you're probably better off using a bona fide HTML parser. Fortunately, Python provides one! The example below is fairly bare-bones - to be fully functional, it would have to handle many more corner cases. (In particular, empty XHTML-style tags (ending with a slash, <... />) need special handling.)
try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class TagDropper(HTMLParser):
    """Re-emit the HTML that is fed in, leaving out selected tags.

    Start tags are copied verbatim from the input, text is passed through,
    and end tags are rebuilt as ``</name>`` using the lower-cased tag name
    reported by the parser.  For every tag named in ``tags_to_drop`` the
    start tag, the end tag and the self-closing form are omitted; any text
    between a dropped start tag and its end tag is kept.

    Limitations: no handlers are defined for comments, declarations or
    processing instructions, so those are not reproduced in the output,
    and character references (``&amp;`` etc.) are not re-emitted verbatim.

    Args:
        tags_to_drop: iterable of tag names to remove (case-insensitive).
        *args, **kwargs: forwarded unchanged to ``HTMLParser.__init__``.
    """

    def __init__(self, tags_to_drop, *args, **kwargs):
        # Explicit base-class call (not super()) so the same code also
        # works with Python 2's HTMLParser.
        HTMLParser.__init__(self, *args, **kwargs)
        self._text = []
        # The parser hands lower-cased tag names to the handlers, so
        # normalise the drop list once to make the lookup case-insensitive.
        self._tags_to_drop = {tag.lower() for tag in tags_to_drop}

    def clear_text(self):
        """Discard everything collected so far."""
        self._text = []

    def get_text(self):
        """Return the collected output as a single string."""
        return ''.join(self._text)

    def handle_starttag(self, tag, attrs):
        """Copy the start tag verbatim unless it is being dropped."""
        if tag not in self._tags_to_drop:
            self._text.append(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Copy a self-closing tag (``<br/>``) verbatim unless dropped.

        The inherited implementation calls handle_starttag() and then
        handle_endtag(), which would turn ``<br/>`` into ``<br/></br>``.
        """
        if tag not in self._tags_to_drop:
            self._text.append(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Emit the end tag unless it is being dropped."""
        # Without this check a dropped element would leave its closing
        # tag behind (e.g. a stray "</b>").
        if tag not in self._tags_to_drop:
            self._text.append('</{0}>'.format(tag))

    def handle_data(self, data):
        """Pass text content through."""
        self._text.append(data)


# With an empty drop list the input comes back unchanged.
td = TagDropper([])
td.feed('A line of text\nA line of text with an <img url="foo"> tag\nAnother line of text with a <br> tag\n')
td.close()  # force processing of any data the parser is still buffering
print(td.get_text())
# A line of text
# A line of text with an <img url="foo"> tag
# Another line of text with a <br> tag
And to drop the img tags ...
# Same input, this time dropping every <img> tag.
td = TagDropper(['img'])
td.feed('A line of text\nA line of text with an <img url="foo"> tag\nAnother line of text with a <br> tag\n')
td.close()  # force processing of any data the parser is still buffering
# Parenthesised so the call works as both the Python 2 print statement
# and the Python 3 print function.
print(td.get_text())
# A line of text
# A line of text with an  tag
# Another line of text with a <br> tag
#
# Only the tag itself is removed, so the spaces on either side of it
# remain (hence the double space in the second line).
senderle
source share