I am working on a project that parses multiple XML files at the same time in Python using lxml. When I start a worker process, I want my main class to do some work on the XML before it passes the etree object to that process. However, I find that when the etree object arrives in the new process, the object itself survives, but the XML content is gone from it and getroot() returns None.
I know that only picklable data can be transmitted through a queue, but is this also the case for what I pass to the process in the "args" field?
Here is my code:
import multiprocessing, multiprocessing.pool, time
from lxml import etree
def compute(tree):
    """Worker task: print diagnostics about the tree received by the process.

    Prints, in order, a start marker, the type of ``tree``, its ``id()`` in
    the current process, and whatever ``tree.getroot()`` returns.

    Args:
        tree: the object submitted through ``apply_async``; it only needs to
            provide a ``getroot()`` method.
    """
    print("Start Process")
    tree_type = type(tree)
    print(tree_type)
    tree_identity = id(tree)
    print(tree_identity)
    root = tree.getroot()
    print(root)
def pool_init(queue):
    """Pool initializer: attach the shared queue to the ``compute`` function.

    The queue is stored as the function attribute ``compute.queue`` in the
    process where this initializer runs.

    Args:
        queue: the queue object passed through the pool's ``initargs``.
    """
    setattr(compute, "queue", queue)
class Main:
    """Demo driver: parse ``test.xml`` and hand the tree to a pool worker."""

    def main(self, timeout=10):
        """Parse the document, submit it to a one-process pool and wait for it.

        The parsed tree's ``id()`` and root are printed in the parent first so
        they can be compared with what ``compute`` prints in the worker.

        Args:
            timeout: maximum number of seconds to wait for the worker task.
                Defaults to 10, the duration the original fixed sleep used.

        Raises:
            multiprocessing.TimeoutError: if the task does not finish in time.
            Exception: anything raised while running ``compute`` in the worker
                is re-raised here by ``AsyncResult.get``.
        """
        tree = etree.parse('test.xml')
        print(id(tree))
        print(tree.getroot())

        self.queue = multiprocessing.Queue()
        self.pool = multiprocessing.Pool(
            processes=1, initializer=pool_init, initargs=(self.queue,)
        )
        pending = self.pool.apply_async(func=compute, args=(tree,))
        try:
            # Wait on the result instead of sleeping blindly: this returns as
            # soon as the task is done and surfaces worker-side exceptions
            # that were previously dropped without a trace.
            pending.get(timeout=timeout)
        except BaseException:
            # The worker may be stuck or broken; do not wait for it to drain.
            self.pool.terminate()
            raise
        else:
            # Normal shutdown lets the worker exit cleanly.
            self.pool.close()
        finally:
            self.pool.join()
if __name__ == '__main__':
    # Only start the demo when executed as a script, not when the module is
    # imported (multiprocessing may import the main module in child processes).
    Main().main()
Any and all help is highly appreciated.
UPDATE / ANSWER
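I solved it by giving the tree custom pickling support that goes through an in-memory stream (String IO / BytesIO).
The pickler serializes the tree with etree.tostring, and the unpickler parses those bytes back into an etree.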
import multiprocessing, multiprocessing.pool, time, copyreg
from lxml import etree
def compute(tree):
    """Worker task: print diagnostics about the tree received by the process.

    Prints, in order, a start marker, the type of ``tree`` and whatever
    ``tree.getroot()`` returns.

    Args:
        tree: the object submitted through ``apply_async``; it only needs to
            provide a ``getroot()`` method.
    """
    print("Start Process")
    tree_type = type(tree)
    print(tree_type)
    root = tree.getroot()
    print(root)
def pool_init(queue):
    """Pool initializer: attach the shared queue to the ``compute`` function.

    The queue is stored as the function attribute ``compute.queue`` in the
    process where this initializer runs.

    Args:
        queue: the queue object passed through the pool's ``initargs``.
    """
    setattr(compute, "queue", queue)
def elementtree_unpickler(data):
    """Rebuild an element tree from its serialized XML bytes.

    This is the reconstructor half of the copyreg pair: it receives the bytes
    that ``elementtree_pickler`` produced and parses them again.

    Args:
        data: the serialized XML document, as returned by ``etree.tostring``.

    Returns:
        The tree returned by ``etree.parse`` for an in-memory stream of ``data``.
    """
    # BytesIO was used here without being imported anywhere in the script,
    # which raises NameError the first time a tree is unpickled. It is
    # imported locally so this function works regardless of the file's
    # top-level imports.
    from io import BytesIO

    return etree.parse(BytesIO(data))
def elementtree_pickler(tree):
    """Reduction function for element trees, for use with ``copyreg``.

    Args:
        tree: the tree to serialize.

    Returns:
        A ``(callable, args)`` pair: ``elementtree_unpickler`` together with a
        one-element tuple holding the output of ``etree.tostring(tree)``.
    """
    serialized = etree.tostring(tree)
    return elementtree_unpickler, (serialized,)
# Register elementtree_pickler as the reduction function for
# etree._ElementTree, so that pickling a tree (for example when it is passed
# as an argument to a pool worker) serializes its XML and the receiving side
# rebuilds it with elementtree_unpickler.
# NOTE(review): the third argument is believed to be a legacy parameter of
# copyreg.pickle that is only required to be callable - confirm against the docs.
copyreg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)
class Main:
    """Demo driver: parse ``test.xml`` and hand the tree to a pool worker."""

    def main(self, timeout=10):
        """Parse the document, submit it to a one-process pool and wait for it.

        The parsed tree's root is printed in the parent first so it can be
        compared with what ``compute`` prints in the worker.

        Args:
            timeout: maximum number of seconds to wait for the worker task.
                Defaults to 10, the duration the original fixed sleep used.

        Raises:
            multiprocessing.TimeoutError: if the task does not finish in time.
            Exception: anything raised while running ``compute`` in the worker
                is re-raised here by ``AsyncResult.get``.
        """
        tree = etree.parse('test.xml')
        print(tree.getroot())

        self.queue = multiprocessing.Queue()
        self.pool = multiprocessing.Pool(
            processes=1, initializer=pool_init, initargs=(self.queue,)
        )
        pending = self.pool.apply_async(func=compute, args=(tree,))
        try:
            # Wait on the result instead of sleeping blindly: this returns as
            # soon as the task is done and surfaces worker-side exceptions
            # that were previously dropped without a trace.
            pending.get(timeout=timeout)
        except BaseException:
            # The worker may be stuck or broken; do not wait for it to drain.
            self.pool.terminate()
            raise
        else:
            # Normal shutdown lets the worker exit cleanly.
            self.pool.close()
        finally:
            self.pool.join()
if __name__ == '__main__':
    # Only start the demo when executed as a script, not when the module is
    # imported (multiprocessing may import the main module in child processes).
    Main().main()
2
, , . , , , etree . async XML, , , . XML .