Cannot pass lxml etree object to a separate process

I am working on a project to parse multiple XML files at the same time in Python using lxml. When I start a process, I want my main class to do some work on the XML before it passes the etree object to the process. However, I find that when the etree object arrives in the new process, the object's class is preserved, but the XML content is gone and getroot() returns None.

I know that I can only send picklable data through a queue, but is this also the case for what I pass to the process in the "args" field?

Here is my code:

import multiprocessing, multiprocessing.pool, time
from lxml import etree

def compute(tree):
    """Worker entry point: report what arrived in the child process.

    Prints a start banner, then the type, the id() and the root element of
    ``tree``, one per line. The trailing notes record what the author saw.
    """
    print("Start Process")
    received_type = type(tree)
    print(received_type)  # observed: <class 'lxml.etree._ElementTree'>
    received_id = id(tree)
    print(received_id)  # observed: a new ID (44637320), as expected
    root = tree.getroot()
    print(root)  # observed: None -- the XML content did not come across

def pool_init(queue):
    """Pool initializer: attach ``queue`` to ``compute`` as a function attribute.

    Technique taken from http://stackoverflow.com/a/3843313/852994
    """
    setattr(compute, "queue", queue)

class Main:
    """Driver for the reproduction: parse an XML file and hand it to a worker."""

    def __init__(self):
        """Nothing to set up; attributes are created in ``main``."""
        pass

    def main(self):
        """Parse ``test.xml``, print its id and root, then submit it to a pool.

        Stores the queue and the pool on the instance and sleeps for ten
        seconds after submitting the task.
        """
        parsed = etree.parse('test.xml')
        print(id(parsed))  # observed object ID: 43998536
        print(parsed.getroot())  # observed: <Element SymCLI_ML at 0x29f5dc8>

        self.queue = multiprocessing.Queue()
        worker_pool = multiprocessing.Pool(
            processes=1,
            initializer=pool_init,
            initargs=(self.queue,),
        )
        self.pool = worker_pool
        worker_pool.apply_async(func=compute, args=(parsed,))
        time.sleep(10)

# Run the demo only when this file is executed as a script, not when it is
# imported (multiprocessing may re-import the main module in worker processes).
if __name__ == '__main__':
    Main().main()

Any and all help is highly appreciated.

UPDATE / ANSWER

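Following the answer below, I registered a pickler/unpickler for the tree. Instead of writing the tree to a StringIO, the pickler serializes it with etree.tostring, and the unpickler parses that data back into an etree.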

import multiprocessing, multiprocessing.pool, time, copyreg
from lxml import etree

def compute(tree):
    """Worker entry point: report what arrived in the child process.

    Prints a start banner, then the type and the root element of ``tree``,
    one per line. The trailing notes record what the author saw.
    """
    print("Start Process")
    received_type = type(tree)
    print(received_type)  # observed: <class 'lxml.etree._ElementTree'>
    root = tree.getroot()
    print(root)  # observed: <Element SymCLI_ML at 0x29f5dc8>. Success!

def pool_init(queue):
    """Pool initializer: attach ``queue`` to ``compute`` as a function attribute.

    Technique taken from http://stackoverflow.com/a/3843313/852994
    """
    setattr(compute, "queue", queue)

def elementtree_unpickler(data):
    """Rebuild an lxml ElementTree from its serialized XML.

    Args:
        data: the serialized document that ``elementtree_pickler`` produced
            with ``etree.tostring``.

    Returns:
        The tree returned by ``etree.parse`` for that document.
    """
    # BytesIO is not among this snippet's imports; without this import the
    # function raises NameError when the tree is unpickled.
    from io import BytesIO

    return etree.parse(BytesIO(data))

def elementtree_pickler(tree):
    """Reduce ``tree`` to ``(elementtree_unpickler, (serialized_xml,))``.

    The XML is serialized with ``etree.tostring`` so that the unpickler can
    parse it back into a tree.
    """
    serialized = etree.tostring(tree)
    rebuild_args = (serialized,)
    return elementtree_unpickler, rebuild_args

# Register elementtree_pickler as the reduction function for
# etree._ElementTree, so pickling a tree (e.g. when it is sent to a pool
# worker) serializes its XML and elementtree_unpickler rebuilds it.
copyreg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)

class Main():
    """Driver that parses an XML file and hands the tree to a pool worker."""

    def __init__(self):
        pass

    def main(self):
        """Parse ``test.xml`` and run ``compute`` on it in a worker process.

        The queue and the pool are stored on the instance. The pool is shut
        down before this method returns.

        Raises:
            multiprocessing.TimeoutError: if the task has not completed
                within 10 seconds.
            Exception: an error raised while sending the task or inside
                ``compute`` is re-raised here by ``AsyncResult.get``.
        """
        tree = etree.parse('test.xml')
        print(tree.getroot()) #Returns <Element SymCLI_ML at 0x29f5dc8>

        self.queue = multiprocessing.Queue()
        self.pool = multiprocessing.Pool(processes=1, initializer=pool_init, initargs=(self.queue,))
        try:
            result = self.pool.apply_async(func=compute, args=(tree,))
            # Wait for the task itself rather than sleeping a fixed time, so
            # the method returns as soon as the work is done and failures are
            # reported instead of silently discarded. The timeout keeps the
            # original 10-second upper bound on the wait.
            result.get(timeout=10)
        finally:
            # terminate() rather than close(): if the task was lost (for
            # example the worker died while receiving it), close() + join()
            # could wait forever.
            self.pool.terminate()
            self.pool.join()

# Run the demo only when this file is executed as a script, not when it is
# imported (multiprocessing may re-import the main module in worker processes).
if __name__ == '__main__':
    Main().main()

2

, , . , , , etree . async XML, , , . XML .

+4
1

Here are picklers/unpicklers for lxml Element/ElementTree objects. I used them to send lxml objects over zmq.

import copy_reg
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
from lxml import etree

def element_unpickler(data):
    """Recreate an element by parsing the XML that element_pickler serialized."""
    element = etree.fromstring(data)
    return element

def element_pickler(element):
    """Reduce ``element`` to ``(element_unpickler, (serialized_xml,))``."""
    return element_unpickler, (etree.tostring(element),)

# Register element_pickler as the reduction function for etree._Element so
# that elements are pickled as serialized XML.
copy_reg.pickle(etree._Element, element_pickler, element_unpickler)

def elementtree_unpickler(data):
    """Recreate a tree by parsing the XML that elementtree_pickler wrote."""
    return etree.parse(StringIO(data))

def elementtree_pickler(tree):
    """Reduce ``tree`` to ``(elementtree_unpickler, (serialized_xml,))``.

    The tree is written into an in-memory StringIO and the accumulated
    contents become the single argument for the unpickler.
    """
    sink = StringIO()
    tree.write(sink)
    serialized = sink.getvalue()
    return elementtree_unpickler, (serialized,)

# Register elementtree_pickler as the reduction function for
# etree._ElementTree so that whole trees are pickled as serialized XML.
copy_reg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)
+5

All Articles