In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score

In [5]:
# Notebook display configuration: lift the cap on how many columns
# pandas renders, so wide frames are shown without column truncation.
pd.options.display.max_columns = None

In [3]:
# Load the train and test splits from CSV files; the relative paths are
# resolved against the notebook's current working directory.
train_data = pd.read_csv(filepath_or_buffer="default_risk_train_data.csv")
test_data = pd.read_csv(filepath_or_buffer="default_risk_test_data.csv")

In [6]:
# Preview the first five rows of the training frame (rich display as the cell's last expression).
train_data.head(n=5)


Out[6]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 
FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 406644 0 Cash loans F N Y 1 202500.0 976711.5 49869.0 873000.0 Unaccompanied Commercial associate Secondary / secondary special Married House / apartment 0.046220 -15743 -4482 -1797.0 -2455 NaN 1 1 0 1 0 1 Laborers 3.0 1 1 TUESDAY 14 0 1 1 0 0 0 Other 0.655600 0.684298 NaN 0.4000 0.1494 0.9970 0.9592 0.2377 0.48 0.2069 0.6250 0.5833 0.0368 0.3261 0.4088 0.0386 0.0744 0.4076 0.1550 0.9970 0.9608 0.2399 0.4834 0.2069 0.6250 0.5833 0.0377 0.3563 0.4259 0.0389 0.0787 0.4039 0.1494 0.9970 0.9597 0.2392 0.48 0.2069 0.6250 0.5833 0.0375 0.3318 0.4161 0.0388 0.0759 reg oper account block of flats 0.4835 Stone, brick No 0.0 0.0 0.0 0.0 -1626.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
1 411997 0 Cash loans M Y Y 0 225000.0 808650.0 26086.5 675000.0 Unaccompanied State servant Higher education Married House / apartment 0.018850 -20659 -10455 -4998.0 -4010 1.0 1 1 0 1 0 1 NaN 2.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Culture NaN 0.623740 0.710674 0.1701 0.0545 0.9781 NaN NaN 0.32 0.2759 0.2083 NaN NaN NaN 0.1022 NaN 0.1441 0.1733 0.0566 0.9782 NaN NaN 0.3222 0.2759 0.2083 NaN NaN NaN 0.1065 NaN 0.1525 0.1718 0.0545 0.9781 NaN NaN 0.32 0.2759 0.2083 NaN NaN NaN 0.1040 NaN 0.1471 NaN block of flats 0.1512 Stone, brick No 2.0 1.0 2.0 1.0 -1704.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 3.0
2 241559 0 Revolving loans M N Y 0 135000.0 180000.0 9000.0 180000.0 Unaccompanied Commercial associate Secondary / secondary special Single / not married House / apartment 0.007305 -9013 -1190 -3524.0 -1644 NaN 1 1 0 1 0 0 Laborers 1.0 3 3 SUNDAY 11 0 0 0 0 0 0 Construction 0.175511 0.492994 0.085595 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -661.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 4.0
3 296530 0 Cash loans F Y Y 2 135000.0 592560.0 32274.0 450000.0 Unaccompanied Working Higher education Married House / apartment 0.018801 -10495 -744 -160.0 -152 15.0 1 1 0 1 0 0 Laborers 4.0 2 2 WEDNESDAY 9 0 0 0 0 0 0 Business Entity Type 3 0.117463 0.621860 0.579727 0.0619 0.0487 0.9816 0.7484 0.0258 0.00 0.1379 0.1667 0.2083 0.0130 0.0504 0.0509 0.0000 0.0000 0.0630 0.0506 0.9816 0.7583 0.0260 0.0000 0.1379 0.1667 0.2083 0.0133 0.0551 0.0530 0.0000 0.0000 0.0625 0.0487 0.9816 0.7518 0.0260 0.00 0.1379 0.1667 0.2083 0.0133 0.0513 0.0518 0.0000 0.0000 reg oper account block of flats 0.0400 Panel No 0.0 0.0 0.0 0.0 -508.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
4 231371 0 Cash loans F N N 1 112500.0 512446.5 34375.5 463500.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.022625 -10270 -451 -1602.0 -2797 NaN 1 1 0 1 0 1 Accountants 3.0 2 2 TUESDAY 16 0 0 0 0 0 0 Business Entity Type 3 0.310480 0.642651 0.466864 0.0722 0.0000 0.9771 0.6804 0.0256 0.00 0.1379 0.1667 0.0417 0.0115 0.0588 0.0443 0.0000 0.0807 0.0735 0.0000 0.9772 0.6929 0.0258 0.0000 0.1379 0.1667 0.0417 0.0117 0.0643 0.0461 0.0000 0.0855 0.0729 0.0000 0.9771 0.6847 0.0257 0.00 0.1379 0.1667 0.0417 0.0117 0.0599 0.0451 0.0000 0.0824 not specified block of flats 0.0524 Panel No 1.0 0.0 0.0 0.0 -12.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0