Since locators return line and column numbers instead of offsets, you need a small wrapper that keeps track of where each line ends in the stream - a simplified example (may have some off-by-ones ;-):
# Map SAX locator positions (line, column) to absolute offsets in the input
# stream by wrapping the stream and remembering where every newline was seen.
import re
from xml import sax
from xml.sax import handler

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: the cStringIO module no longer exists

relinend = re.compile(r'\n')

txt = '''<foo>
  <tit>Bar</tit>
  <baz>whatever</baz>
</foo>'''
stm = StringIO(txt)


class LocatingWrapper(object):
    """File-like wrapper that records the offset of every newline it reads.

    Only ``read`` and ``close`` are forwarded to the wrapped object ``f``.
    Newlines are located with the str pattern ``relinend``, so on Python 3
    the wrapped stream has to return ``str`` data from ``read``.
    """

    def __init__(self, f):
        self.f = f
        # Absolute offsets of each '\n' handed out so far, in stream order.
        self.linelocs = []
        # Total length of the data returned by read() so far.
        self.curoffs = 0

    def read(self, *a):
        """Read from the wrapped stream, noting the offset of each newline.

        Arguments are passed through to ``f.read`` unchanged; the data is
        returned unmodified.
        """
        data = self.f.read(*a)
        base = self.curoffs
        self.linelocs.extend(
            base + m.start() for m in relinend.finditer(data))
        self.curoffs = base + len(data)
        return data

    def close(self):
        """Close the wrapped stream if it has a ``close`` method.

        The SAX reader may close its input source when parsing finishes, so
        the wrapper must expose ``close`` to be usable as a source.
        """
        close = getattr(self.f, 'close', None)
        if close is not None:
            close()

    def offset(self, line, column):
        """Return the absolute offset of a 1-based ``line`` / 0-based ``column``.

        Raises ValueError if ``line`` is smaller than 1 and IndexError if the
        newline preceding ``line`` has not been read yet.
        """
        if line < 1:
            raise ValueError('line numbers start at 1, got %r' % (line,))
        if line == 1:
            # The first line starts at the beginning of the stream.
            return column
        # Line n starts right after the newline that ended line n - 1, and
        # that newline is stored at index n - 2.
        return self.linelocs[line - 2] + 1 + column

    def where(self, loc):
        """Return the absolute stream offset for the SAX locator ``loc``."""
        return self.offset(loc.getLineNumber(), loc.getColumnNumber())


locstm = LocatingWrapper(stm)


class Handler(handler.ContentHandler):
    """Content handler that prints the location of every start tag."""

    def setDocumentLocator(self, loc):
        # Keep the locator so startElement can query the current position.
        self.loc = loc

    def startElement(self, name, attrs):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('% s@ %s:%s (%s)' % (name,
                                   self.loc.getLineNumber(),
                                   self.loc.getColumnNumber(),
                                   locstm.where(self.loc)))


# NOTE(review): assuming the locator reports the position of a tag's '<' with
# 0-based columns, this prints offsets 0, 8 and 25 for foo, tit and baz.
sax.parse(locstm, Handler())
Of course, you don't need to keep all the linelocs around - to save memory, you can discard the "old" ones (those below the last one asked for), but then you need to make linelocs a dict, etc.
source share