notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt



In [6]:

    
df = pd.read_csv('/Users/ajmendez/Downloads/bank-additional/bank-additional-full.csv',
                 sep=';')



In [7]:

    
df.describe()









    Out[7]:






  
    
      
      age
      duration
      campaign
      pdays
      previous
      emp.var.rate
      cons.price.idx
      cons.conf.idx
      euribor3m
      nr.employed
    
  
  
    
      count
      41188.00000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
      41188.000000
    
    
      mean
      40.02406
      258.285010
      2.567593
      962.475454
      0.172963
      0.081886
      93.575664
      -40.502600
      3.621291
      5167.035911
    
    
      std
      10.42125
      259.279249
      2.770014
      186.910907
      0.494901
      1.570960
      0.578840
      4.628198
      1.734447
      72.251528
    
    
      min
      17.00000
      0.000000
      1.000000
      0.000000
      0.000000
      -3.400000
      92.201000
      -50.800000
      0.634000
      4963.600000
    
    
      25%
      32.00000
      102.000000
      1.000000
      999.000000
      0.000000
      -1.800000
      93.075000
      -42.700000
      1.344000
      5099.100000
    
    
      50%
      38.00000
      180.000000
      2.000000
      999.000000
      0.000000
      1.100000
      93.749000
      -41.800000
      4.857000
      5191.000000
    
    
      75%
      47.00000
      319.000000
      3.000000
      999.000000
      0.000000
      1.400000
      93.994000
      -36.400000
      4.961000
      5228.100000
    
    
      max
      98.00000
      4918.000000
      56.000000
      999.000000
      7.000000
      1.400000
      94.767000
      -26.900000
      5.045000
      5228.100000



In [8]:

    
f_list = ['education','job','marital','contact',
          'campaign','duration','loan','poutcome']
sorted([(feature,metrics.adjusted_mutual_info_score(df[feature],df['y'])) 
         for feature in f_list],
       key=lambda x: x[1], reverse=True)









    Out[8]:





[('poutcome', 0.063974329495790822),
 ('contact', 0.017724409216299135),
 ('duration', 0.01235247012967758),
 ('job', 0.0047094532425354194),
 ('campaign', 0.0016892994224096314),
 ('marital', 0.0015227678571156829),
 ('education', 0.0013002646581772531),
 ('loan', -1.9812804868203938e-05)]



In [9]:

    
test_size=.33

pp_data = df.copy()
data_labels = pd.Series([0 if x == 'no' else 1 for x in df.ix[:,-1]])

# Scale numeric data
dts_cols = [0,10, 11, 12,13,15,16,17,18,19]
data_to_scale = pp_data.iloc[:, dts_cols].astype(np.float)  # change int to float
scaler = preprocessing.StandardScaler().fit(data_to_scale)
pp_data.iloc[:, dts_cols] = scaler.transform(data_to_scale)

# Create dummy encoding for categorical data
dtde_cols = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]
data_to_de = pp_data.iloc[:,dtde_cols]
de_data = pd.get_dummies(data_to_de)
pp_data.drop(pp_data.columns[dtde_cols], axis=1, inplace=True)
pp_data = pp_data.merge(de_data, how='inner',left_index=True, right_index=True, copy=False)

pp_data.drop('y',1,inplace=True)

X = pp_data.values
y = data_labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)



In [10]:

    
pca = PCA(n_components=2)
pca.fit_transform(pp_data)

pca_df = pd.DataFrame(pca.components_,columns=pp_data.columns,index = ['PC-1','PC-2'])

np.abs(pca_df.max(axis=0)).sort_values(ascending=False) # regardless of sign









    Out[10]:





pdays                            0.597829
euribor3m                        0.474219
emp.var.rate                     0.471998
nr.employed                      0.451640
cons.price.idx                   0.362964
previous                         0.301984
poutcome_nonexistent             0.116694
contact_telephone                0.113103
cons.conf.idx                    0.103478
campaign                         0.096786
contact_cellular                 0.094763
marital_single                   0.058200
month_jul                        0.050922
default_unknown                  0.044991
month_apr                        0.042105
month_jun                        0.037294
poutcome_success                 0.036876
month_may                        0.033442
default_no                       0.025134
duration                         0.024252
marital_married                  0.023555
housing_yes                      0.023131
job_blue-collar                  0.022197
month_aug                        0.019087
education_high.school            0.018277
housing_no                       0.017682
poutcome_failure                 0.016568
education_basic.9y               0.016017
month_oct                        0.013246
job_services                     0.011612
                                   ...   
day_of_week_thu                  0.007681
day_of_week_mon                  0.006958
job_technician                   0.006889
job_admin.                       0.006568
education_basic.4y               0.005839
day_of_week_wed                  0.004599
loan_yes                         0.003903
education_basic.6y               0.003695
month_dec                        0.003505
age                              0.003335
job_housemaid                    0.002746
marital_divorced                 0.002597
day_of_week_tue                  0.002570
education_professional.course    0.002451
day_of_week_fri                  0.001664
job_entrepreneur                 0.001642
job_self-employed                0.001626
job_management                   0.001620
job_unemployed                   0.001208
job_unknown                      0.000763
month_mar                        0.000437
loan_unknown                     0.000426
housing_unknown                  0.000426
job_student                      0.000425
education_university.degree      0.000338
education_unknown                0.000325
loan_no                          0.000176
marital_unknown                  0.000090
education_illiterate             0.000022
default_yes                      0.000011
dtype: float64



In [11]:

    
np.abs(pca_df.max(axis=0)).sort_values(ascending=False).plot()
plt.xticks(rotation='vertical')









    Out[11]:





(array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.]),
 <a list of 8 Text xticklabel objects>)



In [12]:

    
cor = pp_data.corr()
cor.loc[:,:] = np.tril(cor, k=-1) # below main lower triangle of an array
cor = cor.stack()
(cor[(cor > 0.55) | (cor < -0.55)]).sort_values()









    Out[12]:





contact_telephone     contact_cellular   -1.000000
default_unknown       default_no         -0.999780
housing_yes           housing_no         -0.952819
poutcome_success      pdays              -0.950700
loan_yes              loan_no            -0.915840
poutcome_nonexistent  previous           -0.878776
                      poutcome_failure   -0.853119
marital_single        marital_married    -0.773785
contact_cellular      cons.price.idx     -0.591474
previous              pdays              -0.587514
contact_telephone     cons.price.idx      0.591474
poutcome_failure      previous            0.682608
euribor3m             cons.price.idx      0.688230
cons.price.idx        emp.var.rate        0.775334
nr.employed           emp.var.rate        0.906970
                      euribor3m           0.945154
euribor3m             emp.var.rate        0.972245
loan_unknown          housing_unknown     1.000000
dtype: float64



In [ ]:

    
t_X = pp_data.ix[:,:10].values # lets only use the standard scaled data

# perform t-SNE embedding
tsne = TSNE(n_components=2, init='random', random_state=0)
Y = tsne.fit_transform(t_X)



In [ ]:

    
plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Set3,s=30,alpha=.8)



In [ ]:

	age	duration	campaign	pdays	previous	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed
count	41188.00000	41188.000000	41188.000000	41188.000000	41188.000000	41188.000000	41188.000000	41188.000000	41188.000000	41188.000000
mean	40.02406	258.285010	2.567593	962.475454	0.172963	0.081886	93.575664	-40.502600	3.621291	5167.035911
std	10.42125	259.279249	2.770014	186.910907	0.494901	1.570960	0.578840	4.628198	1.734447	72.251528
min	17.00000	0.000000	1.000000	0.000000	0.000000	-3.400000	92.201000	-50.800000	0.634000	4963.600000
25%	32.00000	102.000000	1.000000	999.000000	0.000000	-1.800000	93.075000	-42.700000	1.344000	5099.100000
50%	38.00000	180.000000	2.000000	999.000000	0.000000	1.100000	93.749000	-41.800000	4.857000	5191.000000
75%	47.00000	319.000000	3.000000	999.000000	0.000000	1.400000	93.994000	-36.400000	4.961000	5228.100000
max	98.00000	4918.000000	56.000000	999.000000	7.000000	1.400000	94.767000	-26.900000	5.045000	5228.100000