Here's one approach -
1) Auxiliary functions:
def argsort_unique(idx):
    """Return the inverse of the permutation ``idx``.

    For an array holding each of ``0 .. n-1`` exactly once (e.g. the
    result of ``argsort``), this is equivalent to ``idx.argsort()`` but
    runs in O(n) by scattering positions instead of sorting.

    Parameters
    ----------
    idx : array_like of int
        A permutation of ``range(len(idx))``.

    Returns
    -------
    numpy.ndarray
        Integer array ``inv`` such that ``inv[idx[i]] == i`` for every ``i``.
    """
    # Force an integer dtype so an empty input is still a valid index array.
    idx = np.asarray(idx, dtype=np.intp)
    n = idx.size
    inverse = np.empty(n, dtype=int)
    # Position i of the permutation maps to value idx[i]; invert by scatter.
    inverse[idx] = np.arange(n)
    return inverse
Perhaps a faster alternative, using a helper function that loops over the groups and fills each one by slicing:
def get_bin_arr(grplens, stop1_idx):
    """Build a 0/1 array flagging the leading elements of consecutive groups.

    The output is the concatenation of one segment per group; segment ``g``
    has length ``grplens[g]`` and its first ``stop1_idx[g]`` entries are 1,
    the rest 0.

    Parameters
    ----------
    grplens : array_like of int
        Length of each group, in output order.
    stop1_idx : array_like of int
        Number of leading ones wanted in each group. Values larger than
        the group length are capped at the group length; negative values
        are treated as 0.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``sum(grplens)`` containing only 0 and 1.
    """
    grplens = np.asarray(grplens, dtype=int)
    if grplens.size == 0:
        # No groups at all: nothing to flag (avoids indexing clens[-1]).
        return np.zeros(0, dtype=int)

    # Clamp each limit to [0, group length]. Without the lower bound a
    # negative limit would become a negative slice end and wrongly flag
    # elements counted from the end of the array.
    stop1_idx_corr = np.minimum(np.maximum(stop1_idx, 0), grplens)

    clens = grplens.cumsum()
    starts = clens - grplens  # start offset of every group in the output
    out = np.zeros(clens[-1], dtype=int)
    for start, stop in zip(starts, starts + stop1_idx_corr):
        out[start:stop] = 1
    return out
2) Main function:
def out_C(A, selDict):
    """Flag the first ``selDict[label]`` occurrences of each label in ``A``.

    Parameters
    ----------
    A : array_like
        1-D array of labels.
    selDict : dict
        Maps a label to the maximum number of its occurrences to flag.
        Labels in ``A`` that are missing from ``selDict`` get a limit of 0;
        keys of ``selDict`` that never occur in ``A`` are ignored.

    Returns
    -------
    numpy.ndarray
        Integer array, same length as ``A``, with 1 at the first
        ``selDict[label]`` occurrences of every label and 0 elsewhere.
    """
    A = np.asarray(A)
    if A.size == 0:
        return np.zeros(0, dtype=int)

    # list(...) is required on Python 3: wrapping a dict view directly in
    # np.array yields a 0-d object array rather than an array of keys.
    keys = np.array(list(selDict.keys()))
    vals = np.array(list(selDict.values()), dtype=int)

    unq, counts = np.unique(A, return_counts=True)

    lims = np.zeros(len(unq), dtype=int)
    if keys.size:
        pos = np.searchsorted(unq, keys)
        # searchsorted returns an insertion point even for absent keys
        # (possibly len(unq)); keep only keys that really occur in A so
        # they cannot overwrite another label's limit or index out of range.
        pos = np.minimum(pos, len(unq) - 1)
        present = unq[pos] == keys
        lims[pos[present]] = vals[present]

    # Flags laid out in label-sorted order, one contiguous group per label.
    bin_arr = get_bin_arr(counts, lims)

    # A stable sort keeps equal labels in original order, so the leading
    # elements of each sorted group are the earliest occurrences in A.
    order = A.argsort(kind='stable')

    # Map the sorted-order flags back to the original positions of A.
    return bin_arr[argsort_unique(order)]
Run Examples -
Original approach:
def org_app(df, selDict):
    """Reference (row-by-row) implementation of the selection.

    Adds or overwrites column ``'C'`` on ``df`` in place: a row gets 1 if
    it is among the first ``selDict[label]`` rows carrying its ``'A'``
    label, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a column ``'A'`` of labels. Modified in place.
    selDict : dict
        Maps each label to the number of rows to flag. Not modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``'C'`` filled in.

    Raises
    ------
    KeyError
        If a label in ``df['A']`` is not a key of ``selDict``.
    """
    df['C'] = 0
    # Work on a copy so the caller's limits are not consumed.
    remaining = selDict.copy()
    for i, r in df.iterrows():
        label = r["A"]
        if remaining[label] > 0:
            remaining[label] -= 1
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # supported scalar setter.
            df.at[i, 'C'] = 1
    return df
Case No. 1:
>>> df = pd.DataFrame({'A': 'foo bar foo bar res foo bar res foo foo res'.split()})
>>> selDict = {"foo":2, "bar":3, "res":1}
>>> org_app(df, selDict)
      A  C
0   foo  1
1   bar  1
2   foo  1
3   bar  1
4   res  1
5   foo  0
6   bar  1
7   res  0
8   foo  0
9   foo  0
10  res  0
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
Case No. 2:
>>> selDict = {"foo":20, "bar":30, "res":10}
>>> org_app(df, selDict)
      A  C
0   foo  1
1   bar  1
2   foo  1
3   bar  1
4   res  1
5   foo  1
6   bar  1
7   res  1
8   foo  1
9   foo  1
10  res  1
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])