I have this code below. It's amazing to me that it works for columns, not rows.
import pandas as pd
def summarizing_data_variables(df):
numberRows=size(df['ID'])
numberColumns=size(df.columns)
summaryVariables=np.empty([numberColumns,2], dtype = np.dtype('a50'))
cont=-1
for column in df.columns:
cont=cont+1
summaryVariables[cont][0]=column
summaryVariables[cont][1]=size(df[df[column].isin([0])][column])/(1.0*numberRows)
print summaryVariables
def summarizing_data_users(fileName):
print "Sumarizing users..."
numberRows=size(df['ID'])
numberColumns=size(df.columns)
summaryVariables=np.empty([numberRows,2], dtype = np.dtype('a50'))
cont=-1
for row in df['ID']:
cont=cont+1
summaryVariables[cont][0]=row
dft=df[df['ID']==row]
proportionZeros=(size(dft[dft.isin([0])])-1)/(1.0*(numberColumns-1))
summaryVariables[cont][1]=proportionZeros
print summaryVariables
if __name__ == '__main__':
df = pd.DataFrame([[1, 2, 3], [2, 5, 0.0],[3,4,5]])
df.columns=['ID','var1','var2']
print df
summarizing_data_variables(df)
summarizing_data_users(df)
Output:
ID var1 var2
0 1 2 3
1 2 5 0
2 3 4 5
[['ID' '0.0']
['var1' '0.0']
['var2' '0.333333333333']]
Sumarizing users...
[['1' '1.0']
['2' '1.0']
['3' '1.0']]
I expected for users:
Sumarizing users...
[['1' '0.0']
['2' '0.5']
['3' '0.0']]
It seems that the problem is in this line:
Tsp [dft.isin ([0])]
It does not limit dft to "True", as in the first case.
Can you help me? (1) How to fix part of users (ROWS) (second function above)? (2) Is this the most effective method for this? [My database is very large]
EDIT:
summaryizing_data_variables (df) . Id ( ), var1 ( , ), var2 ( 1/3). 2D numpy.array, , - .
summaryizing_data_users , . .