Improving the performance of the comparison algorithm np.packbits (A == A [:, None], axis = 1)

I'm looking for memory optimization np.packbits(A==A[:, None], axis=1), where Ais a dense array of integers long n. A==A[:, None]is a hungry memory for large ones n, since the resulting Boolean array is stored inefficiently with each 1-byte Boolean value.

I wrote a script below to achieve the same result while packing bits in one section. This, however, is about 3 times slower, so I'm looking for ways to speed it up. Or, alternatively, a better algorithm with low memory overhead.

Note: this is the next question I asked earlier; Comparing a numpy array with itself over an element is efficient .

The reproducible code below is for benchmarking.

import numpy as np
from numba import jit

@jit(nopython=True)
def bool2int(x):
    y = 0
    for i, j in enumerate(x):
        if j: y += int(j)<<(7-i)
    return y

@jit(nopython=True)
def compare_elementwise(arr, result, section):
    n = len(arr)
    for row in range(n):
        for col in range(n):

            section[col%8] = arr[row] == arr[col]

            if ((col + 1) % 8 == 0) or (col == (n-1)):
                result[row, col // 8] = bool2int(section)
                section[:] = 0

    return result

n = 10000
A = np.random.randint(0, 1000, n)

result_arr = np.zeros((n, n // 8 if n % 8 == 0 else n // 8 + 1)).astype(np.uint8)
selection_arr = np.zeros(8).astype(np.uint8)

# memory efficient version, but slow
packed = compare_elementwise(A, result_arr, selection_arr)

# memory inefficient version, but fast
packed2 = np.packbits(A == A[:, None], axis=1)

assert (packed == packed2).all()

%timeit compare_elementwise(A, result_arr, selection_arr)  # 1.6 seconds
%timeit np.packbits(A == A[:, None], axis=1)  # 0.460 second
+2
1

3 , numpy (a.size 8, . ):

@nb.njit
def comp(a):
    res=np.zeros((a.size,a.size//8),np.uint8)
    for i,x in enumerate(a):
        for j,y in enumerate(a):
            if x==y: res[i,j//8] |= 128 >> j%8 
    return res

, , , .

In [122]: %timeit np.packbits(A == A[:, None], axis=1)
389 ms ± 57.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [123]: %timeit comp(A)
123 ms ± 24.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

a.size%8 > 0, . ( range(7)) .

:

if A.size % 8 != 0: A = np.pad(A, (0, 8 - A.size % 8), 'constant', constant_values=0)
+1

All Articles