Pandas 12 Utils


In [ ]:
import pandas as pd
import numpy as np
import scipy.stats as stat

data = pd.read_csv('./res/train.csv', index_col='Loan_ID')

data[:10]

1. boolean index


In [ ]:
conds = (data['Gender']=='Female') & (data['Education']=='Not Graduate') & (data['Loan_Status'] == 'Y')
print(conds[:10])
data.loc[conds, ['Gender', 'Education', 'Loan_Status']]

2. apply function


In [ ]:
def num_null(x):
    return sum(pd.isnull(x))

print('column null count:')
print(data.apply(func=num_null, axis=0))
print('\nrow null count:')
print(data.apply(func=num_null, axis=1).head())

3. fill null value


In [ ]:
s1 = data['Gender']
g = s1.mode()
print("type:", type(g), " col0:", g[0])
# stat.mode(s1.dropna())
data['Gender'].fillna(g[0], inplace=True)
data['Married'].fillna(data['Married'].mode()[0], inplace=True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)

print(sum(data['Gender'].isnull()), sum(data['Married'].isnull()), sum(data['Self_Employed'].isnull()))

4. pivot table


In [ ]:
# 2 x 2 x 2
v1 = data.pivot_table(values=['LoanAmount'], index=['Gender', 'Married', 'Self_Employed'], aggfunc=np.mean)
print(v1)

5. composite index


In [ ]:
print(sum(data['LoanAmount'].isnull()))
for i, row in data.loc[data['LoanAmount'].isnull(), :].iterrows():
    t = tuple([row['Gender'], row['Married'], row['Self_Employed']])
    data.loc[i, 'LoanAmount'] = v1.loc[t].values[0]
 
print("Again check loanAmount na count:")
data.apply(func=num_null, axis=0)
data.apply()

6. crosstab


In [ ]:
ct = pd.crosstab(index=data['Credit_History'], columns=data['Loan_Status'], margins=True)
print(ct)

def colums_per_row_cal(row):
    return row / float(row[-1])

ct = pd.crosstab(data['Credit_History'], data['Loan_Status'], margins=True).apply(func=colums_per_row_cal, axis=1)
ct

7. merge dataframe


In [ ]:
prop_rates = pd.DataFrame(data=[1000, 5000, 12000], index=['Rural','Semiurban','Urban'],columns=['rates'])
prop_rates

In [ ]:
merge_data = data.merge(right=prop_rates, how='inner', left_on='Property_Area', right_index=True, sort=False)
# merge_data[:5].loc[:,['Credit_History', 'Property_Area', 'rates']]
merge_data[['Credit_History', 'Property_Area', 'rates']].head(5)

In [ ]:
# merge_data[-5:].loc[:,['Credit_History', 'Property_Area', 'rates']]
merge_data[['Credit_History', 'Property_Area', 'rates']].tail(5)

In [ ]:
# values is not usefull, only for simple calculate (len), can use other colums instead
merge_data.pivot_table(values='Credit_History', index=['Property_Area', 'rates'], aggfunc=len)

8. sort dataframe


In [ ]:
sorted_data = data.sort_values(['ApplicantIncome', 'CoapplicantIncome'], ascending=False)
sorted_data[['ApplicantIncome', 'CoapplicantIncome']].head(5)

9. boxplot and histogram


In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
data.boxplot(column='ApplicantIncome', by='Loan_Status')

In [ ]:
data.hist(column='ApplicantIncome', by='Loan_Status', bins=30)

10. cut bins


In [ ]:
def binning(col, cut_points, cut_labels):
    minval = col.min()
    maxval = col.max()
    break_points = [minval] + cut_points + [maxval]
    return pd.cut(x=col, bins=break_points, labels=cut_labels, include_lowest=True)
cut_points = [90, 140, 190]
# [20] + cut_points + [220]
labels = ['low', 'medium', 'high', 'very high']
data['LoanAmount_Bin'] = binning(data['LoanAmount'], cut_points, labels)
data[['LoanAmount_Bin', 'LoanAmount']].head(10)

11. replace (name varibal encoding)


In [ ]:
def encoding(col, codedict):
    colCoded = pd.Series(col, copy=True)
    for k, v in codedict.items():
        colCoded.replace(k, v, inplace=True)
    return colCoded;

data['Loan_Status_Coded'] = encoding(data['Loan_Status'], {'N':0, 'Y':1})
data[['Loan_Status', 'Loan_Status_Coded']].head(10)

12. row iterator


In [ ]:
data.dtypes

In [ ]:
data['Credit_History'].head(5)

In [ ]:
# skiprows: Loan_ID
data_types = pd.read_csv('./res/datatypes.csv', skiprows=[1])
data_types.tail(10)

In [ ]:
for i, row in data_types.iterrows():
    if row['type'] == 'categorical':
        data[row['feature']] = data[row['feature']].astype(np.object)
    elif row['type'] == 'continuous':
        data[row['feature']] = data[row['feature']].astype(np.float)
data.dtypes