notebook.community

Edit and run



In [ ]:

    
"""
Inspired by "An Introduction to Statistical Learning" by James et al.,
where we see a scatter plot of two variables and the corresponding boxplots.

It can be particularly useful for binary classification, where we also have to pass
the binary target variable.
"""



In [1]:

    
# necessary imports
import pandas as pd
import numpy as np
# autoreload (mode 2): re-import modules before each cell runs, so edits to
# local .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
import sys
# make the relative ../lib_plot directory importable; this must run before the
# scatter_boxplot import below (presumably that module lives there -- confirm)
sys.path.append("../lib_plot")
#import db_to_pd as ddd
import scatter_boxplot as sbp

# show matplotlib figures inline in the notebook output
%matplotlib inline









    



/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pytz/__init__.py:29: UserWarning: Module argparse was already imported from /Users/charilaostsarouchas/anaconda/lib/python2.7/argparse.pyc, but /Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream



In [2]:

    
# Load the credit-card "Default" data set that accompanies
# James et al., "An Introduction to Statistical Learning".
#
# Sample found online via http://www.datarobot.com/blog/classification-with-scikit-learn/
# NOTE(review): http://www-bcf.usc.edu/~gareth/ISL/Credit.csv was noted as a
# possible alternative source, but the file name suggests a different table --
# confirm its columns before swapping it in.
DEFAULT_CSV_URL = 'https://d1pqsl2386xqi9.cloudfront.net/notebooks/Default.csv'

# index_col=0: the first CSV column becomes the row index
df = pd.read_csv(DEFAULT_CSV_URL, index_col=0)

# Boolean target derived from the 'default' column: True where the value is
# exactly 'Yes', False for everything else. The vectorized comparison avoids
# a Python-level function call per row.
df['target'] = df['default'] == 'Yes'
df.head()









    Out[2]:






  
    
      
      default
      student
      balance
      income
      target
    
  
  
    
      1
       No
        No
        729.526495
       44361.625074
       False
    
    
      2
       No
       Yes
        817.180407
       12106.134700
       False
    
    
      3
       No
        No
       1073.549164
       31767.138947
       False
    
    
      4
       No
        No
        529.250605
       35704.493935
       False
    
    
      5
       No
        No
        785.655883
       38463.495879
       False



In [3]:

    
# Draw the scatter plot together with the related boxplots.
# The three column names are passed positionally, followed by the plot options;
# NOTE(review): presumably the order is x column, y column, binary class
# column -- confirm against scatter_boxplot.plot_2d_data.
sbp.plot_2d_data(
    df,
    'balance',
    'income',
    'target',
    filename=None,  # no file name supplied
    title='',       # title and axis labels are passed as empty strings
    xlabel='',
    ylabel='',
)



In [ ]:

	default	student	balance	income	target
1	No	No	729.526495	44361.625074	False
2	No	Yes	817.180407	12106.134700	False
3	No	No	1073.549164	31767.138947	False
4	No	No	529.250605	35704.493935	False
5	No	No	785.655883	38463.495879	False