How can I handle a tarfile with a Python multiprocessing pool?

I am trying to process the contents of tarfile with multiprocessing.Pool. I can successfully use the ThreadPool implementation in a multiprocessing module, but would like to be able to use processes instead of threads, as this could be faster and eliminate some of the changes made to Matplotlib to handle a multi-threaded environment. I get an error message that I suspect is due to processes not using the address space, but I'm not sure how to fix it:

Traceback (most recent call last):
  File "test_tarfile.py", line 32, in <module>
    test_multiproc()
  File "test_tarfile.py", line 24, in test_multiproc
    pool.map(read_file, files)
  File "/ldata/whitcomb/epd-7.1-2-rh5-x86_64/lib/python2.7/multiprocessing/pool.py", line 225, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/ldata/whitcomb/epd-7.1-2-rh5-x86_64/lib/python2.7/multiprocessing/pool.py", line 522, in get
    raise self._value
ValueError: I/O operation on closed file

The real program is more complicated, but this is an example of what I am doing that reproduces the error:

from multiprocessing.pool import ThreadPool, Pool
import StringIO
import tarfile

def write_tar():
    tar = tarfile.open('test.tar', 'w')
    contents = 'line1'
    info = tarfile.TarInfo('file1.txt')
    info.size = len(contents)
    tar.addfile(info, StringIO.StringIO(contents))
    tar.close()

def test_multithread():
    tar   = tarfile.open('test.tar')
    files = [tar.extractfile(member) for member in tar.getmembers()]
    pool  = ThreadPool(processes=1)
    pool.map(read_file, files)
    tar.close()

def test_multiproc():
    tar   = tarfile.open('test.tar')
    files = [tar.extractfile(member) for member in tar.getmembers()]
    pool  = Pool(processes=1)
    pool.map(read_file, files)
    tar.close()

def read_file(f):
    print f.read()

write_tar()
test_multithread()
test_multiproc()

, - , TarInfo , TarFile , , . tarball ?

+5
1

TarInfo , tar.extractfile(member) , member TarInfo. extractfile(...) - , , , read(), tar, tar = tarfile.open('test.tar').

, . test_multiproc() :

def test_multiproc():
    tar   = tarfile.open('test.tar')
    files = [name for name in tar.getnames()]
    pool  = Pool(processes=1)
    result = pool.map(read_file2, files)
    tar.close()

:

def read_file2(name):
    t2 = tarfile.open('test.tar')
    print t2.extractfile(name).read()
    t2.close()

.

+5

All Articles