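This can be done with HTMLParser. Instead of flattening the text with str.join as in @falsetru's answer, you can keep the document structure.
To do that, build a tree of nodes from the handle_starttag and handle_endtag callbacks.
First, a small class to represent one node: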
class Element:
    """One node of the parsed document tree.

    Attributes are plain and public so the parser can fill them in directly:

    parent   -- the enclosing Element, or None for the root node
    tag      -- the tag name as given by the caller
    children -- child Elements in the order they were appended
    attrs    -- the attribute list passed in, or an empty list if none/empty
    data     -- text accumulated for this node (starts empty)
    """

    def __init__(self, parent: "Element | None", tag: str, attrs: "list | None" = None) -> None:
        self.parent = parent
        self.tag = tag
        # A fresh list per instance; children are appended by the parser.
        self.children = []
        # Falsy attrs (None or an empty sequence) are normalised to a new list.
        self.attrs = attrs or []
        self.data = ''

    def __repr__(self) -> str:
        # Deliberately shallow: printing children/parent would recurse through the tree.
        return "%s(tag=%r, attrs=%r, children=%d)" % (
            type(self).__name__, self.tag, self.attrs, len(self.children))
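Next, subclass HTMLParser and keep track of the current node in handle_starttag and handle_endtag. Any text is attached to the current node, so nothing is lost.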
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that records the document as a tree of Element nodes.

    After feeding input, ``self.root`` is a synthetic node tagged
    ``'__DOCROOT__'`` whose descendants mirror the nesting of the tags seen.
    ``self.current`` is the node that is currently open; text is appended to
    its ``data`` attribute.
    """

    # HTML void elements: they have no content and no end tag, so the parser
    # must not descend into them (otherwise everything after e.g. <br> would
    # be recorded as a child of that <br>).
    VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        # The root has no parent; it only exists to hold top-level elements.
        self.root = Element(None, '__DOCROOT__')
        self.current = self.root

    def handle_starttag(self, tag, attrs):
        """Append a new node under the current one and make it current."""
        newel = Element(self.current, tag, attrs)
        self.current.children.append(newel)
        if tag not in self.VOID_ELEMENTS:
            self.current = newel

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements left open inside it are closed implicitly. An end tag with
        no matching open element is ignored, so ``current`` can never move
        above the root.
        """
        node = self.current
        while node is not self.root and node.tag != tag:
            node = node.parent
        if node is not self.root:
            self.current = node.parent

    def handle_data(self, data):
        """Append text to the currently open node."""
        self.current.data += data

    def handle_charref(self, ref):
        """Handle a numeric character reference such as ``&#62;`` or ``&#x3e;``."""
        self.handle_entityref('#' + ref)

    def handle_entityref(self, ref):
        """Decode a named (or, via handle_charref, numeric) reference to text.

        NOTE: with HTMLParser's default ``convert_charrefs=True`` the base
        class decodes references itself and does not call this handler; it is
        used when ``convert_charrefs`` is turned off.
        """
        # HTMLParser.unescape() no longer exists (removed in Python 3.9);
        # html.unescape is the supported replacement. The reference is
        # terminated with ';' so it is decoded unambiguously.
        from html import unescape
        self.handle_data(unescape('&%s;' % ref))
After parsing, the tree is available through MyHTMLParser.root, for example:
# Example input: a fragment whose text contains a bare '<' and a bare '&'.
n = '<strong>I <3s U & you luvz me</strong>'
# Parse the whole string with a fresh parser; close() marks the end of input.
p = MyHTMLParser()
p.feed(n)
p.close()
def print_tree(node, indent=0, file=None):
    """Print the tree rooted at *node*, one level of nesting per space.

    For every node two lines are written: the tag at the node's indent, then
    the node's text one space further in (this second line is written even
    when the text is empty). Children follow in order, depth-first.

    node   -- object with ``tag``, ``data`` and ``children`` attributes
    indent -- number of leading spaces for *node* itself
    file   -- optional writable text stream; defaults to ``sys.stdout``
    """
    # An explicit stack instead of recursion, so deeply nested documents do
    # not run into the interpreter's recursion limit.
    pending = [(node, indent)]
    while pending:
        current, depth = pending.pop()
        pad = ' ' * depth
        print(pad + current.tag, file=file)
        print(pad + ' ' + current.data, file=file)
        # Push children reversed so the first child is popped (printed) first.
        pending.extend((child, depth + 1) for child in reversed(current.children))
# Dump the parsed document, starting from the parser's synthetic root node.
print_tree(p.root)
__DOCROOT__
strong
I <3s U & you luvz me
With n = '<html><head><title>Test</title></head><body><h1>I <3s U & you luvz me</h1></body></html>', the output is:
__DOCROOT__
html
head
title
Test
body
h1
I <3s U & you luvz me
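This is only a starting point, and it can be extended. For example, you could add a find('tag') method to Element to search the tree. For anything more involved, use BeautifulSoup.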