I ran this in Jupyter and put the exampledata.txt file in the same directory as the notebook.
Note that the first line:
df = pd.DataFrame(datas, columns=['userid', 'recency', 'frequency', 'monetary'])
loads the column 'userid' even though it is not defined in the data file. I removed this column name.
Solution
import pandas as pd
def pct_rank_qcut(series, n):
    """Assign each value of ``series`` to a bin based on its percentile rank.

    The values are first converted to percentile ranks with
    ``series.rank(pct=True)``. Each rank is then compared against the evenly
    spaced edges ``0/n, 1/n, ..., n/n`` and mapped to the position of the
    first edge that is greater than or equal to it.

    Args:
        series: ``pd.Series`` of values to bin.
        n: Number of bins; determines the ``n + 1`` evenly spaced edges.

    Returns:
        A ``pd.Series`` with the same index as ``series`` holding the bin
        number of each value.
    """
    edges = pd.Series([float(i) / n for i in range(n + 1)])

    def first_edge_at_or_above(x):
        # Take argmax on the underlying array so the result is always the
        # position of the first True; the Series-level argmax was
        # deprecated in older pandas releases.
        return (edges >= x).values.argmax()

    return series.rank(pct=True).apply(first_edge_at_or_above)
# Read the semicolon-delimited sample file and keep only the three metric columns.
datas = pd.read_csv('./exampledata.txt', delimiter=';')
df = pd.DataFrame(datas, columns=['recency', 'frequency', 'monetary'])

# Cast every metric to float before ranking.
for column in ('recency', 'frequency', 'monetary'):
    df[column] = df[column].astype(float)

# Replace each raw metric with its bin number, using 5 bins per column.
for column in ('recency', 'frequency', 'monetary'):
    df[column] = pct_rank_qcut(df[column], 5)
Explanation
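The problem you were experiencing was a result of pd.qcut assuming 5 bins of equal size. In the data you provided, more than 28% of the 'frequency' values are 1, so the lowest quantile edges coincide. This broke qcut.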
I provided a new function, pct_rank_qcut, that addresses this and pushes all the 1s into the first bin.
edges = pd.Series([float(i) / n for i in range(n + 1)])
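This line defines a series of percentile edges based on the desired number of bins, n. For n = 5 the edges are [0.0, 0.2, 0.4, 0.6, 0.8, 1.0].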
f = lambda x: (edges >= x).argmax()
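This line defines a helper function that is applied to another series in the next line. edges >= x returns a series of the same length as edges, where each element is True or False depending on whether x is less than or equal to that edge. For x = 0.14 the result of (edges >= x) is [False, True, True, True, True, True]. By taking argmax(), I identify the first index where the series is True, in this case 1.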
return series.rank(pct=1).apply(f)
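This line takes the input series and turns it into a percentile ranking. I can compare these rankings to the edges I created, which is why I use apply(f). What is returned is a series of bin numbers from 1 to n. This series of bin numbers is the same thing you were trying to get with: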
pd.qcut(df['recency'].values, 5).codes + 1
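This has consequences: the bins are no longer equal in size, and bin 1 borrows entirely from bin 2. But some choice had to be made. If you don't like this choice, use the concept to build your own ranking.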
# Show the first rows of the binned frame; the call form of print runs on Python 2 and 3.
print(df.head())
recency frequency monetary
0 3 5 5
1 2 5 5
2 2 5 5
3 1 5 5
4 2 5 5
Update
pd.Series.argmax() is now deprecated. Simply switch to pd.Series.values.argmax() to update!
def pct_rank_qcut(series, n):
    """Assign each value of ``series`` to a bin based on its percentile rank.

    The values are first converted to percentile ranks with
    ``series.rank(pct=True)``. Each rank is then compared against the evenly
    spaced edges ``0/n, 1/n, ..., n/n`` and mapped to the position of the
    first edge that is greater than or equal to it.

    Args:
        series: ``pd.Series`` of values to bin.
        n: Number of bins; determines the ``n + 1`` evenly spaced edges.

    Returns:
        A ``pd.Series`` with the same index as ``series`` holding the bin
        number of each value.
    """
    edges = pd.Series([float(i) / n for i in range(n + 1)])
    # Unwrap the edges once so each comparison below works on the plain
    # array instead of building a new Series for every element.
    edge_values = edges.values

    def first_edge_at_or_above(x):
        # argmax of a boolean array is the position of its first True.
        return (edge_values >= x).argmax()

    return series.rank(pct=True).apply(first_edge_at_or_above)