Filter multiple NumPy arrays based on intersection of a single column

I have three fairly large NumPy arrays with varying numbers of rows, whose first columns are all integers. I would like to filter these arrays so that the only rows remaining are those whose first-column value is shared by all three. This would leave three arrays of the same size. Entries in the other columns are not necessarily shared between the arrays.

So, with the input:

A = 
[[1, 1],
[2, 2],
[3, 3],]

B = 
[[2, 1],
[3, 2],
[4, 3],
[5, 4]]

C = 
[[2, 2],
[3, 1],
[5, 2]]

I hope to return as output:

A = 
[[2, 2],
[3, 3]]


B = 
[[2, 1],
[3, 2]]

C = 
[[2, 2],
[3, 1]]

My current approach is as follows:

  • Find the intersection of the first columns of all three arrays using numpy.intersect1d()

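  • Use numpy.in1d() on this intersection and each array's first column to find the rows that are not shared (converting the boolean mask to row indices, as described in: "Python: intersection indices numpy array")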

  • Finally, use numpy.delete() on each input array with its non-shared row indices.

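I wonder whether there is an easier or more Pythonic way of doing this, ideally one that would also lend itself to a larger number of arrays.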

+4
3

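The keys in your example are sorted and unique. Assuming that this is no coincidence (this situation often arises, or can easily be enforced), the following works: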

from functools import reduce  # not a builtin on Python 3

import numpy as np

# Example inputs: column 0 is the integer key, sorted ascending and without
# repeats inside each array (both properties are relied upon below).
A = np.array([
    [1, 1],
    [2, 2],
    [3, 3],
])

B = np.array([
    [2, 1],
    [3, 2],
    [4, 3],
    [5, 4],
])

C = np.array([
    [2, 2],
    [3, 1],
    [5, 2],
])

# Keys that occur in the first column of every array.  assume_unique=True
# skips de-duplication, which is only valid because no key repeats within
# a column.
I = reduce(
    lambda left, right: np.intersect1d(left, right, assume_unique=True),
    (arr[:, 0] for arr in (A, B, C)))

# searchsorted is applied directly to each key column, so the columns must
# already be sorted; every key in I is present, so each hit is an exact match.
print(A[np.searchsorted(A[:, 0], I)])
print(B[np.searchsorted(B[:, 0], I)])
print(C[np.searchsorted(C[:, 0], I)])

If the first column is not sorted (but is still unique), a little more work is needed:

# Example whose first column (the key) is unique but NOT sorted.
C = np.array([
    [9, 2],
    [1, 6],
    [5, 1],
    [2, 5],
    [3, 2],
])

def index_by_first_column_entry(M, keys):
    """Return the rows of ``M`` whose first-column value equals each key.

    The first column of ``M`` does not need to be sorted: an argsort
    permutation is handed to ``np.searchsorted`` so the lookup behaves as if
    it were.  Rows come back in the order of ``keys``.

    Every key is expected to occur in ``M[:, 0]``.  ``searchsorted`` returns
    insertion positions, so a missing key yields either an unrelated row or
    an out-of-range index.
    """
    first_col = M[:, 0]
    # Permutation that would sort the key column.
    order = first_col.argsort()
    # Position of each key inside the (virtually) sorted key column.
    positions = np.searchsorted(first_col, keys, sorter=order)
    # Translate sorted positions back into row numbers of M.
    rows = order[positions]
    return M[rows]

# print() with a single argument works on both Python 2 and Python 3.
print(index_by_first_column_entry(C, I))

If the keys are not unique, change the True to False in the intersect1d call:

from functools import reduce  # reduce is not a builtin on Python 3

# assume_unique=False lets intersect1d de-duplicate each key column itself,
# so repeated keys within an array are tolerated.
I = reduce(
    lambda left, right: np.intersect1d(left, right, assume_unique=False),
    (arr[:, 0] for arr in (A, B, C)))

np.unique

+3

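Here is an alternative that avoids sorting: count, for every integer key, how many of the arrays contain it, then keep the rows whose key was seen in all of them: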

import numpy as np

# Setup: three example arrays; column 0 of each holds the integer key.
A = np.array([
    [1, 1],
    [2, 2],
    [3, 3],
])

B = np.array([
    [2, 1],
    [3, 2],
    [4, 3],
    [5, 4],
])

C = np.array([
    [2, 2],
    [3, 1],
    [5, 2],
])


def take_overlap(*arrays):
    """Keep only the rows whose first-column key occurs in every array.

    Each argument is a 2-D integer array whose column 0 is the key.  A list
    with one filtered array per argument is returned, in argument order;
    rows keep their original relative order.

    The lookup table spans ``min_key .. max_key``, so memory grows with the
    key *range* rather than the number of rows; very sparse keys are
    wasteful.  Keys are shifted by the smallest key, so negative keys are
    handled correctly instead of wrapping around as negative indices.

    With no arguments an empty list is returned.  If any argument has no
    rows, nothing can be shared and every result is empty.
    """
    if not arrays:
        return []

    key_columns = [array[:, 0] for array in arrays]
    if any(keys.size == 0 for keys in key_columns):
        # An empty array shares no key with anything.
        return [array[:0] for array in arrays]

    lowest = min(keys.min() for keys in key_columns)
    highest = max(keys.max() for keys in key_columns)

    # seen_in[k - lowest] == number of arrays containing key k.  Fancy-index
    # "+= 1" increments each distinct index once even if it repeats, so a
    # key duplicated inside one array is still counted once for that array.
    seen_in = np.zeros(highest - lowest + 1, dtype=int)
    for keys in key_columns:
        seen_in[keys - lowest] += 1
    shared = seen_in == len(arrays)

    # Look every row's key up in the boolean table and mask the rows with it.
    return [array[shared[keys - lowest]]
            for array, keys in zip(arrays, key_columns)]

# Each result keeps only the rows of the matching input whose first-column
# key occurs in all of A, B and C.
subA, subB, subC = take_overlap(A, B, C)

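Note that the indicator array has one slot for every integer up to the largest key, so this is only efficient when the keys are fairly dense. If they are sparse, e.g. [1, 10, 10000], most of the indicator array is wasted.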

+2

This works, but I'm not sure if this is faster than any other answer:

import numpy as np

# Example inputs: column 0 of each array is the integer key.
A = np.array([
    [1, 1],
    [2, 2],
    [3, 3],
])

B = np.array([
    [2, 1],
    [3, 2],
    [4, 3],
    [5, 4],
])

C = np.array([
    [2, 2],
    [3, 1],
    [5, 2],
])

a = A[:, 0]
b = B[:, 0]
c = C[:, 0]

# Broadcasted all-pairs comparison: ab[0]/ab[1] are the row numbers in A/B
# whose keys are equal, likewise bc for B/C.  Note this materialises a
# len(a) x len(b) (and len(b) x len(c)) boolean matrix.
ab = np.where(a[:, np.newaxis] == b[np.newaxis, :])
bc = np.where(b[:, np.newaxis] == c[np.newaxis, :])

# Keep only the matches that go through a B row present in both pairings.
# np.isin replaces the deprecated np.in1d; for 1-D inputs the result is the same.
ab_in_bc = np.isin(ab[1], bc[0])
bc_in_ab = np.isin(bc[0], ab[1])

arows = ab[0][ab_in_bc]
brows = ab[1][ab_in_bc]
crows = bc[1][bc_in_ab]

anew = A[arows, :]
bnew = B[brows, :]
cnew = C[crows, :]

print(anew)
print(bnew)
print(cnew)

gives:

[[2 2]
 [3 3]]
[[2 1]
 [3 2]]
[[2 2]
 [3 1]]
0
source

All Articles