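This script prints the <instrumentConfiguration/> information from a large file (967 MB) in about 40 seconds (on my machine), which corresponds to
24 MB/s. For comparison, the cElementTree page (2005) reports 47 MB/s.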
from itertools import imap, islice, izip
from operator import itemgetter
from xml.etree import cElementTree as etree
def parsexml(filename):
    """Lazily extract instrument configurations from an mzML file.

    The document is streamed with ``etree.iterparse`` on ``start`` events
    only, so nothing but element tags and attributes is consulted. The
    root element is cleared after every processed start event to keep
    memory use low on very large files.

    Each ``<instrumentConfiguration>`` is expected to be laid out as:
    one element carrying a ``name`` attribute (a ``<cvParam>`` in the
    sample data), then ``<componentList count="N">``, then N component
    elements, each immediately followed by its ``<cvParam>`` child.

    Args:
        filename: passed unchanged to ``etree.iterparse``.

    Yields:
        One list of ``(key, value)`` tuples per configuration:
        ``('Id', <id attribute>)``, ``('Parameter1', <name attribute>)``,
        then ``(<component tag without namespace>, <cvParam name>)`` for
        each component.

    Raises:
        ValueError: if the document ends in the middle of a configuration
            or a component is not followed by a ``cvParam`` element.
            ``int()`` also raises ``ValueError``/``TypeError`` when the
            ``count`` attribute is non-numeric or missing.
    """
    config_tag = '{http://psi.hupo.org/ms/mzml}instrumentConfiguration'
    # A generator expression keeps the event stream lazy on both Python 2
    # and Python 3 (no dependency on the Python 2-only itertools.imap).
    it = (elem for _event, elem in
          etree.iterparse(filename, events=('start',)))
    root = next(it)
    for elem in it:
        if elem.tag == config_tag:
            # next(it, None) instead of next(it): a bare StopIteration
            # inside a generator silently ends it on Python 2 and turns
            # into RuntimeError on Python 3.7+ (PEP 479).
            param = next(it, None)
            component_list = next(it, None)
            if param is None or component_list is None:
                raise ValueError(
                    'document ended inside instrumentConfiguration %r'
                    % (elem.get('id'),))
            values = [('Id', elem.get('id')),
                      ('Parameter1', param.get('name'))]
            component_count = int(component_list.get('count'))
            # islice takes exactly component_count parents and never reads
            # ahead, so elements after the list stay in the stream.
            for parent in islice(it, component_count):
                child = next(it, None)
                # Explicit check instead of assert: asserts vanish under -O.
                if child is None or not child.tag.endswith('cvParam'):
                    raise ValueError(
                        'expected a cvParam element after %r, got %r'
                        % (parent.tag,
                           None if child is None else child.tag))
                key = parent.tag.partition('}')[2]  # drop '{namespace}'
                values.append((key, child.get('name')))
            yield values
        # Detach everything seen so far from the root so the tree does not
        # grow with the size of the file.
        root.clear()
def print_values(it):
    """Print every pair from an iterable of configurations.

    ``it`` yields configurations; each configuration is an iterable of
    string pairs. Each pair is written on its own line, with its items
    joined by ``': '``. Items are consumed lazily, one configuration at
    a time.
    """
    for configuration in it:
        for pair in configuration:
            print(': '.join(pair))
# Script entry point: stream the instrument configurations out of the file
# and print each pair as "key: value".
# NOTE(review): `filename` is not assigned anywhere in the visible code --
# presumably it is defined elsewhere (e.g. taken from the command line);
# confirm before running.
print_values(parsexml(filename))
$ /usr/bin/time python parse_mxml.py
Id: QTOF
Parameter1: Q-Tof ultima
source: nanoelectrospray
analyzer: quadrupole
analyzer: time-of-flight
detector: microchannel plate detector
38.51user 1.16system 0:40.09elapsed 98%CPU (0avgtext+0avgdata 23360maxresident)k
1984784inputs+0outputs (2major+1634minor)pagefaults 0swaps
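The code is tied to this particular document layout. It assumes, for example, that the first element inside <instrumentConfiguration/> is a <cvParam/> and that the next one is <componentList/>, and it does not handle other layouts.
Each component inside <componentList/> is likewise expected to be immediately followed by its <cvParam/>.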
ElementTree 1.3 is about 6 times slower than cElementTree 1.0.6.
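If root.clear() is replaced with elem.clear(), the code runs about 10% faster but needs about 10 times more memory. lxml.etree works with the elem.clear() variant and is as fast as cElementTree, but it uses 20 times (compared with root.clear()) / 2 times (compared with elem.clear()) more memory (about 500 MB).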