Pandas: assign column values up to the limit specified by dictionary values

How to remove iterrows() ? Can this be done faster with numpy or pandas?

 import pandas as pd import numpy as np df = pd.DataFrame({'A': 'foo bar foo bar foo bar foo foo'.split(), 'B': 'one one two three two two one three'.split(), 'C': np.arange(8)*0 }) print(df) # ABC # 0 foo one 0 # 1 bar one 0 # 2 foo two 0 # 3 bar three 0 # 4 foo two 0 # 5 bar two 0 # 6 foo one 0 # 7 foo three 0 selDict = {"foo":2, "bar":3} 

It works:

 for i, r in df.iterrows(): if selDict[r["A"]] > 0: selDict[r["A"]] -=1 df.set_value(i, 'C', 1) print df # ABC # 0 foo one 1 # 1 bar one 1 # 2 foo two 1 # 3 bar three 1 # 4 foo two 0 # 5 bar two 1 # 6 foo one 0 # 7 foo three 0 
+2
source share
3 answers

Here's one approach -

1) Auxiliary functions:

 def argsort_unique(idx): # Original idea : http://stackoverflow.com/a/41242285/3293881 by @Andras n = idx.size sidx = np.empty(n,dtype=int) sidx[idx] = np.arange(n) return sidx def get_bin_arr(grplens, stop1_idx): count_stops_corr = np.minimum(stop1_idx, grplens) limsc = np.maximum(grplens, count_stops_corr) L = limsc.sum() starts = np.r_[0,limsc[:-1].cumsum()] shift_arr = np.zeros(L,dtype=int) stops = starts + count_stops_corr stops = stops[stops<L] shift_arr[starts] += 1 shift_arr[stops] -= 1 bin_arr = shift_arr.cumsum() return bin_arr 

Perhaps a faster alternative, with a helper function based on loop-based slicing:

 def get_bin_arr(grplens, stop1_idx): stop1_idx_corr = np.minimum(stop1_idx, grplens) clens = grplens.cumsum() out = np.zeros(clens[-1],dtype=int) out[:stop1_idx_corr[0]] = 1 for i,j in zip(clens[:-1], clens[:-1] + stop1_idx_corr[1:]): out[i:j] = 1 return out 

2) Main function:

 def out_C(A, selDict): k = np.array(selDict.keys()) v = np.array(selDict.values()) unq, C = np.unique(A, return_counts=1) sidx3 = np.searchsorted(unq, k) lims = np.zeros(len(unq),dtype=int) lims[sidx3] = v bin_arr = get_bin_arr(C, lims) sidx2 = A.argsort() out = bin_arr[argsort_unique(sidx2)] return out 

Run Examples -

Original approach:

 def org_app(df, selDict): df['C'] = 0 d = selDict.copy() for i, r in df.iterrows(): if d[r["A"]] > 0: d[r["A"]] -=1 df.set_value(i, 'C', 1) return df 

Case No. 1:

 >>> df = pd.DataFrame({'A': 'foo bar foo bar res foo bar res foo foo res'.split()}) >>> selDict = {"foo":2, "bar":3, "res":1} >>> org_app(df, selDict) AC 0 foo 1 1 bar 1 2 foo 1 3 bar 1 4 res 1 5 foo 0 6 bar 1 7 res 0 8 foo 0 9 foo 0 10 res 0 >>> out_C(df.A.values, selDict) array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) 

Case No. 2:

 >>> selDict = {"foo":20, "bar":30, "res":10} >>> org_app(df, selDict) AC 0 foo 1 1 bar 1 2 foo 1 3 bar 1 4 res 1 5 foo 1 6 bar 1 7 res 1 8 foo 1 9 foo 1 10 res 1 >>> out_C(df.A.values, selDict) array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) 
+2
source

If I understood correctly, you can use cumcount:

 df['C'] = (df.groupby('A').cumcount() < df['A'].map(selDict)).astype('int') df Out: ABC 0 foo one 1 1 bar one 1 2 foo two 1 3 bar three 1 4 foo two 0 5 bar two 1 6 foo one 0 7 foo three 0 
+4
source

scipy.stats.rankdata can help here. To get the rank of each item in our bucket, we use the difference between the min and ordinal methods:

 >>> from scipy.stats import rankdata as rd >>> rd(df.A, 'ordinal') - rd(df.A, 'min') array([0, 0, 1, 1, 2, 2, 3, 4]) 

Then we just compare with df.A.map(selDict) :

 df.C = (rd(df.A, 'ordinal') - rd(df.A, 'min') < df.A.map(selDict)).astype(int) 

It might be a little inefficient (calling rankdata twice), but using the optimized routines in scipy should make up for this.

If you cannot use scipy, you can use a double argsort() for the 'ordinal' method and my solution using unique and bincount for the 'min' method:

 >>> _, v = np.unique(df.A, return_inverse=True) >>> df.A.argsort().argsort() - (np.cumsum(np.concatenate(([0], np.bincount(v)))))[v] 0 0 1 0 2 1 3 1 4 2 5 2 6 3 7 4 Name: A, dtype: int64 

Then compare with df.A.map(selDict) as above.

+1
source

All Articles