I get an unexpected error while using this. The first section is from a script that I found on the Internet, and I'm trying to use it to pull out a specific section specified in a PDF outline. Everything works fine except for the final output.write(outputfile1) call, which fails with:
PdfReadError: several definitions in a dictionary.
Has anyone else come across this? Please forgive all the unnecessary print statements at the end. :)
import glob

import pyPdf


# Outline titles that delimit the section to extract.
START_TITLE = "CATEGORY 1"
END_TITLE = "CATEGORY 2"


class Darrell(pyPdf.PdfFileReader):
    """PdfFileReader that can map outline (bookmark) titles to page indices."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping each outline title to a zero-based page index.

        A title whose target page id is not found in the page tree maps to
        the string '???' instead of an integer. When several outline entries
        share the same title, only one of them survives in the result.
        """

        def _setup_outline_page_ids(outline, _result=None):
            # Walk the (possibly nested) outline and record the object id of
            # the page each destination points at. id(obj) keeps entries with
            # identical titles apart at this stage.
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Walk the page tree depth-first. _num_pages grows by one entry
            # per leaf page, so its length when a kid is reached is that
            # kid's zero-based page index.
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            node_type = pages["/Type"]
            if node_type == "/Pages":
                for page in pages["/Kids"]:
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif node_type == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        result = {}
        for (_, title), page_idnum in outline_page_ids.items():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result


def _extract_category(fileName):
    """Write the pages between the two delimiting bookmarks to a new PDF.

    Prints the outline of ``fileName`` and, when both START_TITLE and
    END_TITLE resolve to pages, copies the pages from the START_TITLE page
    through the END_TITLE page (inclusive) into ``<prefix>Category1_data.pdf``.
    Files missing either bookmark are reported and skipped.
    """
    print(fileName)
    # The reader stays open until after output.write().
    # NOTE(review): pyPdf appears to read page objects lazily from the input
    # stream while writing -- confirm before moving the write out of this block.
    with open(fileName, 'rb') as source:
        pdf = Darrell(source)
        # Computed once; the original rebuilt this mapping for every loop.
        destinations = pdf.getDestinationPageNumbers()

        # Unresolved destinations carry '???' instead of an int; keep them
        # out of the arithmetic so that `page + 1` cannot raise TypeError.
        resolved = sorted(
            (page, title)
            for title, page in destinations.items()
            if isinstance(page, int)
        )
        unresolved = sorted(
            title
            for title, page in destinations.items()
            if not isinstance(page, int)
        )

        template = '%-5s %s'
        print(template % ('page', 'title'))
        for page, title in resolved:
            print(template % (page + 1, title))
        for title in unresolved:
            print(template % ('???', title))

        # Reset per file so a PDF without the bookmarks can neither raise
        # NameError nor silently reuse the previous file's page numbers.
        startpg = None
        endpg = None
        for page, title in resolved:
            if title == START_TITLE:
                startpg = page + 1
                print("%s is the first page of Category 1." % startpg)
            elif title == END_TITLE:
                endpg = page + 1
                print("%s is the last page of Category 1." % endpg)

        if startpg is None or endpg is None:
            print("%s: bookmark %r or %r not found, skipping."
                  % (fileName, START_TITLE, END_TITLE))
            return

        print("%s %s" % (startpg, endpg))
        # startpg/endpg are one-based page numbers while getPage() takes a
        # zero-based index, so the first index is startpg - 1. The original
        # range(startpg, endpg) dropped the first page of the section.
        pagenums = list(range(startpg - 1, endpg))
        print(pagenums)
        if not pagenums:
            print("%s: %r comes after %r, skipping."
                  % (fileName, START_TITLE, END_TITLE))
            return

        output = pyPdf.PdfFileWriter()
        for i in pagenums:
            output.addPage(pdf.getPage(i))

        # Drops the last 13 characters of the input name -- presumably a
        # fixed suffix in the author's file naming scheme; TODO confirm.
        fileName2 = "%sCategory1_data.pdf" % (str(fileName[:-13]))
        print("%s has %s pages." % (fileName2, output.getNumPages()))

        # NOTE(review): a PdfReadError about repeated definitions in a
        # dictionary raised from this write presumably means the *source*
        # PDF contains a dictionary with a duplicated key, which pyPdf
        # rejects while parsing -- confirm against the offending file.
        with open(fileName2, 'wb') as outputfile1:
            output.write(outputfile1)


def main():
    """Process every PDF found in the current working directory."""
    for fileName in glob.glob("*.pdf"):
        _extract_category(fileName)


if __name__ == "__main__":
    main()
source share