Python: unescape special characters without splitting data

Question

Python: unescape special characters without splitting data

I made a simple HTML parser, which is basically a direct copy of the example in the docs. I'm having trouble unescaping special characters without the data being split into several chunks.

Here is my code with a simple example:

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect the text of a document while ignoring every tag.

    Each chunk of text handed to ``handle_data`` becomes its own item in
    ``self.data``. Entity and character references are decoded with
    ``self.unescape`` and stored through the same path.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # One item per handle_data call, in the order the calls arrive.
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Ignore opening tags; only text is collected."""

    def handle_endtag(self, tag):
        """Ignore closing tags; only text is collected."""

    def handle_data(self, data):
        """Store one chunk of text as a new item of ``self.data``."""
        self.data += [data]

    def handle_charref(self, ref):
        """Forward a numeric reference as an entity name prefixed with '#'."""
        self.handle_entityref("#" + ref)

    def handle_entityref(self, ref):
        """Decode ``&ref;`` and store the result like ordinary text."""
        escaped = "&%s;" % ref
        self.handle_data(self.unescape(escaped))



# Sample input: the text inside <strong> contains two entity references.
n = "<strong>I &lt;3s U &amp; you luvz me</strong>"


parser = MyHTMLParser()
parser.feed(n)
parser.close()
# parser.data is the list that handle_data appended to, one item per call.
data = parser.data
print(data)

The problem is that it returns 5 separate bits of data

['I ', u'<', '3s U ', u'&', ' you luvz me']

Whereas what I want is a single string:

['I <3s U & you luvz me']

Thanks, JP

+4

python html special-characters

jprockbelly Jan 2 '14 at 3:53

source share

3 answers

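The splitting comes from how HTMLParser reports text: every entity reference triggers its own callback. If a flat string is all you need, str.join as suggested by @falsetru is enough. If you also need to know which tag the text belongs to, a little more work is required.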

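To keep the text of each element together, you have to track which element is currently open. That is what handle_starttag and handle_endtag are for.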

First, a small class to represent one element of the document tree.

class Element:
    """One node of the parsed document tree.

    Attributes:
        parent: the enclosing Element, as passed in by the caller.
        tag: the tag name of this element.
        attrs: the attributes passed in, or a fresh empty list when the
            argument is omitted or empty.
        children: child elements, initially empty.
        data: accumulated text of this element, initially the empty string.
    """

    def __init__(self, parent, tag, attrs=None):
        self.parent, self.tag = parent, tag
        # A falsy attrs (None or an empty sequence) is replaced by a new list.
        self.attrs = attrs if attrs else []
        self.children = []
        self.data = ''

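Next, have the HTMLParser subclass create a new node in handle_starttag and step back to the parent node in handle_endtag. Text is appended to the current node, so it is never split.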

class MyHTMLParser(HTMLParser):
    """Build a tree of Element nodes, keeping each element's text in one string.

    The tree hangs off ``self.root``, a synthetic node tagged ``__DOCROOT__``.
    ``self.current`` always points at the innermost element that is still
    open; text and decoded references are appended to its ``data``.

    Note that every start tag opens a new level, so elements that are never
    closed in the input (for example ``<br>``) become the parent of whatever
    follows them.
    """

    def __init__(self):
        super().__init__()
        # Special root node so top-level elements and text have a parent.
        self.root = Element(None, '__DOCROOT__')
        self.current = self.root

    def handle_starttag(self, tag, attrs):
        """Create a child of the current element and make it current."""
        newel = Element(self.current, tag, attrs)
        self.current.children.append(newel)
        self.current = newel

    def handle_endtag(self, tag):
        """Close the current element by stepping back to its parent.

        A surplus end tag seen while already at the root is ignored;
        otherwise ``current`` would become ``None`` and the next callback
        would fail.
        """
        if self.current.parent is not None:
            self.current = self.current.parent

    def handle_data(self, data):
        """Append a chunk of text to the element that is currently open."""
        self.current.data += data

    def handle_charref(self, ref):
        """Forward a numeric reference as an entity name prefixed with '#'."""
        self.handle_entityref('#' + ref)

    def handle_entityref(self, ref):
        """Decode ``&ref;`` and append the result like ordinary text."""
        # HTMLParser.unescape no longer exists on Python 3.9+; html.unescape
        # is the supported replacement. Imported here so the block does not
        # depend on the file's import section.
        from html import unescape
        self.handle_data(unescape('&%s;' % ref))

After parsing, the whole tree is available from MyHTMLParser.root, and it can be printed like this.

# Parse the sample markup; the resulting tree is reachable from p.root.
n = '<strong>I &lt;3s U &amp; you luvz me</strong>'
p = MyHTMLParser()
p.feed(n)
p.close()

def print_tree(node, indent=0):
    """Print ``node`` and all its descendants as an indented outline.

    For every node two lines are written: the tag, then the node's text two
    spaces further in. Each nesting level adds four spaces.

    Args:
        node: object with ``tag``, ``data`` and ``children`` attributes.
        indent: nesting depth of ``node``; 0 for the top of the outline.
    """
    pad = '    ' * indent
    print(pad + node.tag)
    print(pad + '  ' + node.data)
    for child in node.children:
        print_tree(child, indent + 1)

# Print the whole tree, starting from the parser's root node.
print_tree(p.root)

__DOCROOT__

    strong
      I <3s U & you luvz me

With a fuller document, such as n = '<html><head><title>Test</title></head><body><h1>I &lt;3s U &amp; you luvz me</h1></body></html>', the output is:

__DOCROOT__

    html

        head

            title
              Test
        body

            h1
              I <3s U & you luvz me

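This is only a sketch, of course. You could add a find('tag') method to Element to look elements up. For anything more serious, use an existing library such as BeautifulSoup.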

+1

kalhartt Jan 2 '14 at 7:57

You can collect the text with the references still escaped and unescape it once at the end.

Use the html_to_text function below, which strips the tags and then unescapes the joined text.

from HTMLParser import HTMLParser
# Sample markup containing two entity references (&lt; and &amp;).
n = "<strong>I &lt;3s U &amp; you luvz me</strong>"

class MLStripper(HTMLParser):
    """Strip tags from HTML, keeping text and still-escaped references.

    Text chunks and references are collected in ``self.fed`` in document
    order. References are stored in their escaped form (``&amp;``,
    ``&#60;``) so that the caller can unescape the joined text once.
    """

    def __init__(self):
        # Run the real initialiser instead of only reset(): on Python 3 it
        # sets attributes that feed() relies on.
        HTMLParser.__init__(self)
        # Python 3 decodes references itself by default and then never calls
        # the two reference handlers below; the caller's unescape would then
        # run on already-decoded text. On Python 2 this is an unused attribute.
        self.convert_charrefs = False
        self.fed = []

    def handle_data(self, d):
        """Keep a chunk of plain text."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Keep a named reference in its escaped form, e.g. ``&amp;``."""
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        """Keep a numeric reference in its escaped form, e.g. ``&#60;``.

        Without this handler numeric references are silently dropped.
        """
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return everything collected so far as one string."""
        return ''.join(self.fed)


def html_to_text(html):
    """Return the text of ``html`` with tags removed and references decoded.

    Args:
        html: markup to convert.

    Returns:
        A single string; entity and character references are unescaped
        exactly once, after all chunks have been joined.
    """
    # The import names the stdlib module "html"; it is unrelated to the
    # parameter of the same name, which stays a plain string.
    try:
        from html import unescape  # Python 3.4+
    except ImportError:
        unescape = HTMLParser().unescape  # Python 2
    s = MLStripper()
    s.feed(html)
    # Flush anything the parser is still buffering (e.g. an unfinished tag).
    s.close()
    return unescape(s.get_data())

# Call form of print: valid as a Python 2 statement and a Python 3 function.
print(html_to_text(n))

Output:

I <3s U & you luvz me

+1

Puffin GDI Jan 2 '14 at 8:21

falsetru · Accepted Answer · 2014-01-02T03:58:05+0000

Join the list of strings using str.join:

>>> ''.join(['I ', u'<', '3s U ', u'&', ' you luvz me'])
u'I <3s U & you luvz me'

Alternatively, you can use external libraries, such as lxml:

>>> import lxml.html
>>> n = "<strong>I &lt;3s U &amp; you luvz me</strong>"
>>> root = lxml.html.fromstring(n)
>>> root.text_content()
'I <3s U & you luvz me'

Python: unescape special characters without splitting data

More articles: