Here's an unreliable, inefficient approach based on a recursive regex:
import re

# Matches one element of the form <NAME>...</NAME>. The back-reference
# (?P=tag) requires the closing tag to repeat the opening name, and re.S lets
# the content span newlines. Because the content group is non-greedy, an
# element nested inside another element with the SAME name is cut short at
# the first closing tag -- this is why the approach is unreliable.
re_tag = re.compile(r'<(?P<tag>[^>]+)>(?P<content>.*?)</(?P=tag)>', re.S)


def iterparse(text, tag=None):
    """Yield ``(tag, content)`` pairs for every element found in *text*.

    Elements are reported outermost first, then their nested elements, in
    document order. *content* is the raw inner text and may still contain
    markup of nested elements.

    *tag* is used by the recursion: when it is not None, ``(tag, text)``
    itself is yielded before *text* is scanned for nested elements.
    """
    if tag is not None:
        yield tag, text
    for match in re_tag.finditer(text):
        # Distinct loop name so the function's own parameters are not rebound.
        for pair in iterparse(match.group('content'), match.group('tag')):
            yield pair


def strip_tags(content):
    """Return *content* with every matched ``<NAME>...</NAME>`` pair removed.

    Only the markup is dropped; the enclosed text is kept. Tags without a
    matching closing tag are not matched by ``re_tag`` and are left in place.
    """
    def unwrap(match):
        # Replace an element by its content, itself stripped recursively.
        return re_tag.sub(unwrap, match.group('content'))

    return re_tag.sub(unwrap, content)


def tag_number(tag):
    """Return the integer id that precedes the first ``_`` in *tag*.

    ``"133_3"`` and ``"133_12"`` both give ``133``; slicing off a fixed two
    characters would break on the second form.

    Raises ValueError if the part before the underscore is not an integer.
    """
    return int(tag.partition('_')[0])


def index_by_text(text):
    """Map each element's tag-free text to the list of ids that wrap it.

    Ids are appended in the order ``iterparse`` reports the elements, so
    elements with identical stripped text share a single entry.
    """
    index = {}
    for tag, content in iterparse(text):
        index.setdefault(strip_tags(content), []).append(tag_number(tag))
    return index


txt = "<133_3><135_3><116_2>The other system worked for about 1 month</116_2> got some good images <137_3>on it then it started doing the same thing as the first one</137_3> so then I quit using either camera now they are just sitting and collecting dust. </135_3></133_3>"

d = index_by_text(txt)
print(d)
Output:
{'on it then it started doing the same thing as the first one': [137], 'The other system worked for about 1 month': [116], 'The other system worked for about 1 month got some good images on it then it started doing the same thing as the first one so then I quit using either camera now they are just sitting and collecting dust. ': [133, 135]}
source share