import pandas as pd
# Example data: 'A' is the grouping key; 'B', 'C' and 'D' hold the values
# whose occurrences are tallied per group.
df = pd.DataFrame({
    'A': ['a', 'a', 'a', 'a', 'c', 'c', 'v'],
    'B': ['d', 'd', 'h', 'i', 'i', 'g', 'g'],
    'C': ['ii', 'g', 'g', 'k', 'k', 'ii', 'p'],
    'D': ['domain', 'domain', 'domain', 'motif', 'motif', 'motif', 'domain'],
})


def count_values_by_group(frame, group_col, value_cols):
    """Tally how often each value of ``value_cols`` occurs within each group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input table.
    group_col : str
        Column whose distinct values become the rows of the result.
    value_cols : list of str
        Columns whose cell values are pooled and counted per group.

    Returns
    -------
    pandas.DataFrame
        One row per group label and one integer column per distinct value
        (both in sorted order, 0 where a value never occurs in a group),
        followed by a ``'count'`` column holding the number of rows in the
        group.
    """
    value_cols = list(value_cols)
    # Flatten the value columns row by row and repeat every group key once
    # per value column, so keys[i] is the group that vals[i] came from.
    keys = pd.Series(frame[group_col].to_numpy().repeat(len(value_cols)),
                     name='group')
    vals = pd.Series(frame[value_cols].to_numpy().ravel(), name='value')
    # The group labels are never mixed into the tallied values, so a value
    # that happens to equal a group label is counted like any other value.
    counts = pd.crosstab(keys, vals)
    counts.index.name = None
    counts.columns.name = None
    # NOTE: a tallied value that is literally 'count' would be overwritten
    # by this column.
    counts['count'] = frame.groupby(group_col).size()
    return counts


# Sorted group labels, and for every group its label repeated once per row
# followed by the row-wise flattened B/C/D values. Both are kept because the
# following cells display and reuse them.
n = [name for name, _ in df.groupby('A')]
d = [[name] * len(grp) + grp[['B', 'C', 'D']].to_numpy().ravel().tolist()
     for name, grp in df.groupby('A')]

# Per-group value tallies plus the group size, indexed by group label.
rslt = count_values_by_group(df, 'A', ['B', 'C', 'D'])
The intermediate list `d`:
d
Out[138]:
[['a',
'a',
'a',
'a',
'd',
'ii',
'domain',
'd',
'g',
'domain',
'h',
'g',
'domain',
'i',
'k',
'motif'],
['c', 'c', 'i', 'k', 'motif', 'g', 'ii', 'motif'],
['v', 'g', 'p', 'domain']]
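Convert it to a DataFrame. Python's built-in `list.count`, used inside a generator expression, tallies each value; the resulting NaNs are then filled with 0.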
# One tally dict per group (value -> occurrences in that group's list);
# values missing from a group come out as NaN and are replaced by 0.
pd.DataFrame([{item: row.count(item) for item in row} for row in d]).fillna(0)
Out[141]:
a c d domain g h i ii k motif p v
0 4 0 2 3 2 1 1 1 1 1 0 0
1 0 2 0 0 1 0 1 1 1 2 0 0
2 0 0 0 1 1 0 0 0 0 0 1 1
The final DataFrame:
rslt
Out[143]:
d domain g h i ii k motif p count
a 2 3 2 1 1 1 1 1 0 4
c 0 0 1 0 1 1 1 2 0 2
v 0 1 1 0 0 0 0 0 1 1