Here is my take on it, using a generator and lxml.etree. Only a few fields are extracted, purely as an example.
import urllib2, os, zipfile
from lxml import etree
def _is_xml_declaration(line):
    """Return True when *line* starts with ``<?xml`` (text or byte string)."""
    prefix = b'<?xml' if isinstance(line, bytes) else '<?xml'
    return line.startswith(prefix)


def _join_lines(lines):
    """Concatenate a non-empty list of lines, keeping their string type."""
    # An empty slice of the first line gives '' or b'' as appropriate, so the
    # same code joins text lines and byte lines.
    return lines[0][:0].join(lines)


def xmlSplitter(data, separator=_is_xml_declaration):
    """Split a stream of concatenated XML documents into single documents.

    data      -- iterable of lines (text or byte strings), e.g. an open file.
    separator -- callable taking a line and returning a true value when that
                 line begins a new document; by default, lines starting
                 with ``<?xml``.

    Yields each document as one string made of its joined lines. Lines seen
    before the first separator line are yielded as a document of their own.
    Nothing is yielded for empty input.
    """
    buff = []
    for line in data:
        # A separator line closes the document collected so far (if any)
        # and becomes the first line of the next one.
        if separator(line) and buff:
            yield _join_lines(buff)
            buff = []
        buff.append(line)
    # Flush the last document; skip it when no line was read at all so
    # callers never receive an empty, unparsable document.
    if buff:
        yield _join_lines(buff)
def first(seq, default=None):
    """Give back the leading element of seq, or default when seq is empty."""
    # next() with a fallback stops after at most one element is pulled.
    return next(iter(seq), default)
# Weekly patent grant bibliography archive; the local copy is named after
# the last component of the URL.
datasrc = "http://commondatastorage.googleapis.com/patents/grantbib/2011/ipgb20110104_wk01.zip"
filename = datasrc.split('/')[-1]

# Download the archive only when no local copy exists yet.
if not os.path.exists(filename):
    r = urllib2.urlopen(datasrc)
    try:
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated file that later runs would mistake for a
        # complete archive.
        partial = filename + '.part'
        with open(partial, 'wb') as file_write:
            file_write.write(r.read())
    finally:
        r.close()
    os.rename(partial, filename)

zf = zipfile.ZipFile(filename)
try:
    # Use the first .xml member of the archive.
    xml_file = first(x for x in zf.namelist() if x.endswith('.xml'))
    assert xml_file is not None, "no .xml member found in " + filename

    count = 0
    for item in xmlSplitter(zf.open(xml_file)):
        # Only the first 10 documents are processed, as a demonstration.
        count += 1
        if count > 10:
            break
        doc = etree.XML(item)
        # All text nodes under publication-reference/document-id, joined
        # with dashes.
        docID = "-".join(doc.xpath('//publication-reference/document-id/*/text()'))
        title = first(doc.xpath('//invention-title/text()'))
        # None when the document has no assignee organisation name.
        assignee = first(doc.xpath('//assignee/addressbook/orgname/text()'))
        print("DocID: {0}\nTitle: {1}\nAssignee: {2}\n".format(docID, title, assignee))
finally:
    zf.close()
Output:
DocID: US-D0629996-S1-20110104
Title: Glove backhand
Assignee: Blackhawk Industries Product Group Unlimited LLC
DocID: US-D0629997-S1-20110104
Title: Belt sleeve
Assignee: None
DocID: US-D0629998-S1-20110104
Title: Underwear
Assignee: X-Technology Swiss GmbH
DocID: US-D0629999-S1-20110104
Title: Portion of compression shorts
Assignee: Nike, Inc.
DocID: US-D0630000-S1-20110104
Title: Apparel
Assignee: None
DocID: US-D0630001-S1-20110104
Title: Hooded shirt
Assignee: None
DocID: US-D0630002-S1-20110104
Title: Hooded shirt
Assignee: None
DocID: US-D0630003-S1-20110104
Title: Hooded shirt
Assignee: None
DocID: US-D0630004-S1-20110104
Title: Headwear cap
Assignee: None
DocID: US-D0630005-S1-20110104
Title: Footwear
Assignee: Vibram S.p.A.