First, split the columns:
In [11]: from collections import defaultdict pos = defaultdict(list) vals = defaultdict(list) In [12]: for i, c in enumerate(df_dummies.columns): if "_" in c: k, v = c.split("_", 1) pos[k].append(i) vals[k].append(v) else: pos["_"].append(i) In [13]: pos Out[13]: defaultdict(list, {'_': [0], 'm': [1, 2, 3], 'qj': [4, 5, 6]}) In [14]: vals Out[14]: defaultdict(list, {'m': ['M1', 'M2', 'M7'], 'qj': ['q23', 'q4', 'q9']})
This lets you slice out a separate frame for each group of dummy columns:
In [15]: df_dummies.iloc[:, pos["m"]] Out[15]: m_M1 m_M2 m_M7 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 0 1 0 5 1 0 0
Now we can use numpy argmax:
In [16]: np.argmax(df_dummies.iloc[:, pos["m"]].values, axis=1) Out[16]: array([0, 1, 2, 0, 1, 0])
*Note: pandas `idxmax` returns the column label; here we want the integer position instead, so that it can be used directly as a code in `pd.Categorical.from_codes`.*
In [17]: pd.Categorical.from_codes(np.argmax(df_dummies.iloc[:, pos["m"]].values, axis=1), vals["m"]) Out[17]: [M1, M2, M7, M1, M2, M1] Categories (3, object): [M1, M2, M7]
Now we can put it all together:
In [21]: df = pd.DataFrame({k: pd.Categorical.from_codes(np.argmax(df_dummies.iloc[:, pos[k]].values, axis=1), vals[k]) for k in vals}) In [22]: df Out[22]: m qj 0 M1 q23 1 M2 q4 2 M7 q9 3 M1 q23 4 M2 q23 5 M1 q9
and add back the non-dummy columns:
In [23]: df[df_dummies.columns[pos["_"]]] = df_dummies.iloc[:, pos["_"]] In [24]: df Out[24]: m qj Budget 0 M1 q23 39 1 M2 q4 15 2 M7 q9 13 3 M1 q23 53 4 M2 q23 82 5 M1 q9 70
As a function:
def reverse_dummy(df_dummies, prefix_sep="_"):
    """Collapse one-hot ("dummy") columns back into categorical columns.

    Columns whose label is a string containing ``prefix_sep`` are treated as
    dummies named ``<prefix><prefix_sep><category>``; the label is split on
    the first occurrence of the separator. All other columns are copied
    through unchanged.

    Parameters
    ----------
    df_dummies : pandas.DataFrame
        Frame holding dummy columns and, optionally, other columns.
    prefix_sep : str, default "_"
        Separator between the prefix and the category in dummy column names.

    Returns
    -------
    pandas.DataFrame
        One categorical column per prefix, followed by the non-dummy columns,
        indexed like ``df_dummies``. A row whose dummies for a prefix are all
        zero/False becomes NaN in that column.
    """
    positions = {}    # prefix -> positional indices of its dummy columns
    labels = {}       # prefix -> category labels, in the same order
    passthrough = []  # positional indices of the non-dummy columns
    for i, col in enumerate(df_dummies.columns):
        # Non-string labels (e.g. ints) cannot be dummies; pass them through
        # instead of raising on the membership test.
        if isinstance(col, str) and prefix_sep in col:
            prefix, label = col.split(prefix_sep, 1)
            positions.setdefault(prefix, []).append(i)
            labels.setdefault(prefix, []).append(label)
        else:
            passthrough.append(i)

    decoded = {}
    for prefix, cols in positions.items():
        block = df_dummies.iloc[:, cols].values
        # argmax yields 0 for an all-zero row, which would silently pick the
        # first category; use code -1 (missing) for such rows instead.
        codes = np.where(block.any(axis=1), np.argmax(block, axis=1), -1)
        decoded[prefix] = pd.Categorical.from_codes(codes, labels[prefix])

    # Reuse the input's index: the assignment below aligns on index labels,
    # so a fresh RangeIndex would misalign (or NaN-fill) the copied columns.
    result = pd.DataFrame(decoded, index=df_dummies.index)
    if passthrough:
        result[df_dummies.columns[passthrough]] = df_dummies.iloc[:, passthrough]
    return result


# In [31]: reverse_dummy(df_dummies)
# Out[31]:
#     m   qj  Budget
# 0  M1  q23      39
# 1  M2   q4      15
# 2  M7   q9      13
# 3  M1  q23      53
# 4  M2  q23      82
# 5  M1   q9      70