Let's say I have two numpy arrays, A form (d, f) and I form (d,) containing indices at 0..n , for example.
I = np.array([0, 0, 1, 0, 2, 1]) A = np.arange(12).reshape(6, 2)
I am looking for a quick way to make abbreviations, in particular sum , mean and max , across all slices A[I == i, :] ; slow version will be
results = np.zeros((I.max() + 1, A.shape[1])) for i in np.unique(I): results[i, :] = np.mean(A[I == i, :], axis=0)
which gives in this case
results = [[ 2.66666667, 3.66666667], [ 7. , 8. ], [ 8. , 9. ]])
EDIT : I did some timings based on Divacar's answer and previous pandas answer (deleted).
Time Code:
from __future__ import division, print_function import numpy as np, pandas as pd from time import time np.random.seed(0) d = 500000 f = 500 n = 500 I = np.hstack((np.arange(n), np.random.randint(n, size=(d - n,)))) np.random.shuffle(I) A = np.random.rand(d, f) def reduce_naive(A, I, op="avg"): target_dtype = (np.float if op=="avg" else A.dtype) results = np.zeros((I.max() + 1, A.shape[1]), dtype=target_dtype) npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op) for i in np.unique(I): results[i, :] = npop(A[I == i, :], axis=0) return results def reduce_reduceat(A, I, op="avg"): sidx = I.argsort() sI = I[sidx] sortedA = A[sidx] idx = np.r_[ 0, np.flatnonzero(sI[1:] != sI[:-1])+1 ] if op == "max": return np.maximum.reduceat(sortedA, idx, axis=0) sums = np.add.reduceat(sortedA, idx, axis=0) if op == "sum": return sums if op == "avg": count = np.r_[idx[1:] - idx[:-1], A.shape[0] - idx[-1]] return sums/count.astype(float)[:,None] def reduce_bincount(A, I, op="avg"): ids = (I[:,None] + (I.max()+1)*np.arange(A.shape[1])).ravel() sums = np.bincount(ids, A.ravel()).reshape(A.shape[1],-1).T if op == "sum": return sums if op == "avg": return sums/np.bincount(ids).reshape(A.shape[1],-1).T def reduce_pandas(A, I, op="avg"): group = pd.concat([pd.DataFrame(A), pd.DataFrame(I, columns=("i",)) ], axis=1 ).groupby('i') if op == "sum": return group.sum().values if op == "avg": return group.mean().values if op == "max": return group.max().values def reduce_hybrid(A, I, op="avg"): sidx = I.argsort() sI = I[sidx] sortedA = A[sidx] idx = np.r_[ 0, np.flatnonzero(sI[1:] != sI[:-1])+1 ] unq_sI = sI[idx] m = I.max()+1 N = A.shape[1] target_dtype = (np.float if op=="avg" else A.dtype) out = np.zeros((m,N),dtype=target_dtype) ss_idx = np.r_[idx,A.shape[0]] npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op) for i in range(len(idx)): out[unq_sI[i]] = npop(sortedA[ss_idx[i]:ss_idx[i+1]], axis=0) return out for op in ("sum", "avg", "max"): for name, method in (("naive ", reduce_naive), ("reduceat", reduce_reduceat), ("pandas ", reduce_pandas), ("bincount", reduce_bincount), ("hybrid ", reduce_hybrid) ("numba ", reduce_numba) ): if op == "max" and name == "bincount": continue
Timings:
naive sum 1.10 reduceat sum 4.62 pandas sum 5.29 bincount sum 1.54 hybrid sum 0.62 numba sum 0.31 naive avg 1.12 reduceat avg 4.45 pandas avg 5.23 bincount avg 2.43 hybrid avg 0.61 numba avg 0.33 naive max 1.19 reduceat max 3.18 pandas max 5.24 hybrid max 0.72 numba max 0.34
(I chose d and n as typical values ββfor my use case - I added code for numba versions in my answer).