This notebook extracts the relevant columns from a Kaggle loan dataset in order to predict default.
In [2]:
import pandas as pd
In [3]:
# Load the full loan table from loan.csv in the working directory.
df = pd.read_csv(filepath_or_buffer='loan.csv')
In [4]:
# Number of rows in the loaded frame.
df.shape[0]
Out[4]:
In [5]:
# Preview the first five rows.
df.head(n=5)
Out[5]:
In [6]:
# List every column name in the loaded frame.
df.columns
Out[6]:
In [8]:
# Save the rows whose index label is below 10 as a small sample file.
# index=False is not passed here, so the index is written as an extra
# leading column in sample.csv.
df.loc[df.index < 10].to_csv('sample.csv')
In [9]:
# Distinct loan_status values present in the data.
df['loan_status'].unique()
Out[9]:
In [10]:
# loan_status values that are treated as "in default" when the in_default
# flag column is built.
in_default = [
    'Charged Off',
    'Default',
    'Late (31-120 days)',
    'Late (16-30 days)',
    'Does not meet the credit policy. Status:Charged Off',
]
In [11]:
# Number of loan_status values treated as default.
len(in_default)
Out[11]:
In [27]:
# Binary label: 1 when loan_status is one of the statuses listed in
# in_default, otherwise 0. Series.isin performs the membership test in a
# single vectorized call rather than invoking a Python lambda per row.
# A missing loan_status is not in the list, so it is labelled 0.
df['in_default'] = df['loan_status'].isin(in_default).astype('int64')
In [28]:
# Sanity check: distinct values of the new in_default flag column.
df['in_default'].unique()
Out[28]:
In [30]:
# Share of rows flagged in_default == 1 out of all rows.
len(df.loc[df['in_default'] == 1]) / len(df)
Out[30]:
In [31]:
# Columns written to default.csv (47 names; the last one is the in_default label).
# NOTE(review): loan_status is exported next to in_default, which is derived
# from it, so the label can be read straight off that column -- drop it before
# fitting a model. Columns such as out_prncp, total_pymnt and next_pymnt_d
# look like post-origination values as well -- TODO confirm against the data
# dictionary.
cols = [
    'loan_amnt',
    'funded_amnt',
    'funded_amnt_inv',
    'term',
    'int_rate',
    'installment',
    'grade',
    'sub_grade',
    'emp_title',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'issue_d',
    'loan_status',
    'pymnt_plan',
    'desc',
    'purpose',
    'title',
    'zip_code',
    'addr_state',
    'dti',
    'delinq_2yrs',
    'earliest_cr_line',
    'inq_last_6mths',
    'mths_since_last_record',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'initial_list_status',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'collections_12_mths_ex_med',
    'mths_since_last_major_derog',
    'policy_code',
    'application_type',
    'annual_inc_joint',
    'dti_joint',
    'verification_status_joint',
    'in_default',
]
In [32]:
# Write only the selected columns to default.csv, without the index column.
df.loc[:, cols].to_csv(path_or_buf='default.csv', index=False)
In [20]:
# df1 = pd.DataFrame(cols)
In [22]:
# df1.to_csv('desc.csv', index=False)
In [ ]: