MemoryError when creating a cartesian product in Numpy

I have 3 numpy arrays and need to generate the cartesian product between them. The sizes of the arrays are not fixed, so they can take different values; one example is A = (10000, 50), B = (40, 50), C = (10000, 50).

Then I do some processing on the product (e.g. a + b - c). Below is the function that I use to build it.

import numpy as np

def cartesian_2d(arrays, out=None):
    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    # total number of row combinations
    n = np.prod([x.shape[0] for x in arrays])
    if out is None:
        out = np.empty([n, len(arrays), arrays[0].shape[1]], dtype=dtype)

    # each row of arrays[0] heads a block of m combinations
    m = n // arrays[0].shape[0]
    out[:, 0] = np.repeat(arrays[0], m, axis=0)
    if arrays[1:]:
        # fill the first block recursively, then copy it into the remaining blocks
        cartesian_2d(arrays[1:], out=out[0:m, 1:, :])
        for j in range(1, arrays[0].shape[0]):
            out[j * m:(j + 1) * m, 1:] = out[0:m, 1:]
    return out

a = [[ 0, -0.02], [1, -0.15]]
b = [[0, 0.03]]

result = cartesian_2d([a,b,a])

# array([[[ 0.  , -0.02],
#         [ 0.  ,  0.03],
#         [ 0.  , -0.02]],
#
#        [[ 0.  , -0.02],
#         [ 0.  ,  0.03],
#         [ 1.  , -0.15]],
#
#        [[ 1.  , -0.15],
#         [ 0.  ,  0.03],
#         [ 0.  , -0.02]],
#
#        [[ 1.  , -0.15],
#         [ 0.  ,  0.03],
#         [ 1.  , -0.15]]])

The output is the same as with itertools.product. However, I use this custom function so that I can apply NumPy's vectorized operations afterwards, which works much better than itertools.product in my case.
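For reference, the equivalence with itertools.product can be checked on the tiny example (this is only a sanity check; the pure-Python route is far slower on large inputs):

import itertools

reference = np.array(list(itertools.product(a, b, a)))
# reference.shape == (4, 3, 2) and matches cartesian_2d([a, b, a])
assert np.array_equal(reference, cartesian_2d([a, b, a]))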

After that I do

result[:, 0, :] + result[:, 1, :] - result[:, 2, :]

# array([[ 0.  ,  0.03],
#        [-1.  ,  0.16],
#        [ 1.  , -0.1 ],
#        [ 0.  ,  0.03]])

So this is the final expected result.

The function works as expected as long as the arrays fit in memory. But my use case requires me to work with huge data, and I get a MemoryError at the np.empty() line because it cannot allocate the required memory. I am working with roughly 20 GB of data at the moment, and this will grow in the future.

The values have to be stored as float, so I cannot use int. The arrays are also dense, so a sparse representation is not an option.

I will use these arrays for further processing, and ideally I would not like to store them in files at this stage, so a memmap / h5py format may not help, although I am not sure about that.

If there are other ways to form this product, that would be fine too.

Since there must be applications that work with far larger datasets than this, I hope someone has run into this problem before and knows how to deal with it.

If at least your result fits in memory: the following produces your expected result without relying on an intermediate array three times the size of the result, because it uses broadcasting.

Note that almost every NumPy operation broadcasts like this, so in practice there is probably no need for an explicit cartesian product at all:

#shared dimensions:
sh = a.shape[1:]
aba = (a[:, None, None] + b[None, :, None] - a[None, None, :]).reshape(-1, *sh)
aba
#array([[ 0.  ,  0.03],
#       [-1.  ,  0.16],
#       [ 1.  , -0.1 ],
#       [ 0.  ,  0.03]])

Addressing by 'ID'

You may consider leaving out the reshape. That would let you address rows of the result by a combined index. If your component IDs are just 0, 1, 2, ... as in the example, this combined index is the same as the combined ID: for example, element [1, 0, 0] of the un-reshaped result is the row built from the second row of a, the first row of b and the first row of a.
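A minimal sketch of that idea, keeping the un-reshaped intermediate (aba_nd is just a name used here for illustration):

a = np.asarray([[0, -0.02], [1, -0.15]])
b = np.asarray([[0, 0.03]])

aba_nd = a[:, None, None] + b[None, :, None] - a[None, None, :]
aba_nd.shape     # (2, 1, 2, 2): one leading axis per input array, plus the shared column axis
aba_nd[1, 0, 0]  # row built from a[1], b[0] and a[0] -> array([ 1. , -0.1])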

Broadcasting: when adding (or subtracting, etc.) two arrays, their shapes do not have to be identical, only compatible, because broadcasting fills in the rest. In a sense it is a generalization of adding a scalar to an array:

    [[2],                 [[7],   [[2],
7 +  [3],     equiv to     [7], +  [3],
     [4]]                  [7]]    [4]]

Broadcasting works the same way between two arrays:

              [[4],            [[1, 2, 3],   [[4, 4, 4],
[[1, 2, 3]] +  [5],  equiv to   [1, 2, 3], +  [5, 5, 5],
               [6]]             [1, 2, 3]]    [6, 6, 6]]

In both cases the smaller operand has a dimension of size 1 (or is missing a dimension), and broadcasting virtually repeats it along that axis until the shapes match, without copying any data. The None (np.newaxis) indices in the expression above insert exactly such size-1 axes, one per input array, so every combination of rows is formed on the fly rather than materialized as an explicit cartesian product.
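The same two examples, written out in code:

col = np.array([[2], [3], [4]])
7 + col             # -> [[9], [10], [11]]

row = np.array([[1, 2, 3]])
col2 = np.array([[4], [5], [6]])
row + col2          # (1, 3) + (3, 1) broadcasts to (3, 3):
                    # [[5, 6, 7], [6, 7, 8], [7, 8, 9]]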

If your result does not fit in memory, then I'm afraid your only option is to use h5py or something like it.

If you do not need the first ('ID') column in the arithmetic, you can strip it off first:

a_no_id = a[:, 1:]

Whether you drop it before or after forming the product matters: if you stay within NumPy, it is cheaper to drop it beforehand, so the large intermediate only holds the value columns (the position of each result row still tells you which input rows it came from, as described above).
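A sketch of that, assuming the first column really is an ID that is not needed in the arithmetic (b_no_id and aba_values are names introduced here):

b_no_id = b[:, 1:]

sh = a_no_id.shape[1:]
aba_values = (a_no_id[:, None, None]
              + b_no_id[None, :, None]
              - a_no_id[None, None, :]).reshape(-1, *sh)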

Alternatively, you can build the cartesian product of the row indices instead of the rows themselves (using any cartesian_product implementation that works on 1D arrays):

idx = cartesian_product(
    np.arange(len(a)),
    np.arange(len(b)) + len(a),
    np.arange(len(a))
)

and then look up the actual rows:

x = np.concatenate((a, b))
result = x[idx.ravel(), :].reshape(*idx.shape, -1)
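cartesian_product itself is not defined above; a minimal 1D-only version (an assumption based on np.meshgrid, not necessarily the implementation the answer had in mind) could look like this:

def cartesian_product(*arrays):
    # index grids, one per input array, flattened into rows of combined indices
    grids = np.meshgrid(*arrays, indexing='ij')
    return np.stack([g.ravel() for g in grids], axis=-1)

On the tiny example arrays this reproduces the same (4, 3, 2) result as cartesian_2d([a, b, a]).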

First, work out how much memory the result will need:

size_in_GB = A.shape[0]**2*A.shape[1]*B.shape[0]*(size_of_datatype)/1e9

In your example A.shape = (10000, 50) and B.shape = (40, 50). With float64 this comes to roughly 1600 GB. That only works if you have the disk space, and even then reading or writing that much data takes considerable time, so if there is any way to avoid materializing the full cartesian product, do so.

If you do need it, you will have to read or write those 1600 GB (or your estimated 200 GB) at least once; at around 200 MB/s of sequential disk throughput, a single pass over 1600 GB already takes more than two hours.
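For the concrete shapes quoted above (float64 is 8 bytes), the formula gives the following; A_rows, A_cols and B_rows are just placeholder names used here:

A_rows, A_cols, B_rows = 10000, 50, 40
size_in_GB = A_rows**2 * A_cols * B_rows * 8 / 1e9
size_in_GB   # 1600.0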

To keep the amount of data that hits the disk manageable, the following example therefore compresses it with blosc and writes it to HDF5 chunk by chunk, using the broadcasting approach from @PaulPanzer's answer.

import numpy as np
import tables #register blosc
import h5py as h5
import h5py_cache as h5c

a=np.arange(500*50).reshape(500, 50)
b=np.arange(40*50).reshape(40, 50)

# The blosc filter options aren't well documented, have a look at https://github.com/Blosc/hdf5-blosc
# positions 4-6 are: compression level (0...9), shuffle (0/1), compressor id (1, I guess that's lz4)
compression_opts = (0, 0, 0, 0, 9, 1, 1)

File_Name_HDF5 = 'Test.h5'
f = h5c.File(File_Name_HDF5, 'w', chunk_cache_mem_size=1024**2*300)
dset = f.create_dataset('Data',
                        shape=(a.shape[0]**2*b.shape[0], a.shape[1]),
                        dtype='d',
                        chunks=(a.shape[0]*b.shape[0], 1),
                        compression=32001,
                        compression_opts=compression_opts,
                        shuffle=False)

#Write the data
for i in range(a.shape[0]):
  sh = a.shape[1:]
  aba = (a[i] + b[:, None] - a).reshape(-1, *sh)
  dset[i*a.shape[0]*b.shape[0]:(i+1)*a.shape[0]*b.shape[0]]=aba

f.close()
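A small optional check of the first written block against the direct broadcast computation (just an illustration, not part of the original answer):

with h5c.File(File_Name_HDF5, 'r', chunk_cache_mem_size=1024**2*300) as f_check:
    block = f_check['Data'][0:a.shape[0]*b.shape[0]]
expected = (a[0] + b[:, None] - a).reshape(-1, a.shape[1])
assert np.allclose(block, expected)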

File_Name_HDF5='Test.h5'
f = h5c.File(File_Name_HDF5, 'r',chunk_cache_mem_size=1024**2*300)
dset=f['Data']
chunks_size=500
for i in range(0,dset.shape[0],chunks_size):
  #Iterate over the first column
  data=dset[i:i+chunks_size,:] #avoid excessive calls to the hdf5 library
  #Do something with the data

f.close()

f = h5c.File(File_Name_HDF5, 'r',chunk_cache_mem_size=1024**2*300)
dset=f['Data']
for i in range(dset.shape[1]):
  # Iterate over the second dimension
  # fancy indexing, e.g. dset[:, i], would be much slower;
  # use np.expand_dims or (here) np.squeeze after reading from the dset
  # if you want the same result as [:, i] (a 1-dim array)
  data=dset[:,i:i+1]
  #Do something with the data

f.close()

The data rate is roughly 550 MB/s for the first read example and 500 MB/s for the second (writing runs at about 1000 MB/s), and the compression ratio on this simple example data is about 50. The chunks are decompressed into the chunk cache by the HDF5 library, so reading them behaves much like working with an ordinary (C-contiguous) NumPy array; a plain NumPy memmap would be much slower here because the data on disk could not be compressed.

Note that the compression settings, chunk shape and chunk cache size have to match your access pattern.

If you do something completely wrong here, performance can easily be 10-100 times slower than doing it the right way (for example, the chunk shape can be optimized for either the first or the second read example).
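As a hypothetical illustration of that last point: the dataset above is chunked as (a.shape[0]*b.shape[0], 1), which suits the column-wise reads in the second example. If the row-wise reads of the first example were the dominant access pattern, row-oriented chunks would fit better; this would replace the create_dataset call in the writing step (chunks_size as used in the first read loop):

dset = f.create_dataset('Data',
                        shape=(a.shape[0]**2*b.shape[0], a.shape[1]),
                        dtype='d',
                        chunks=(chunks_size, a.shape[1]),  # matches dset[i:i+chunks_size, :]
                        compression=32001,
                        compression_opts=compression_opts,
                        shuffle=False)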
