In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
In [6]:
# Data: UCI "Bank Marketing" dataset (bank-additional-full.csv), ';'-separated.
# NOTE(review): absolute local path — won't run on other machines; consider a
# configurable DATA_DIR. Kept as-is to preserve behavior here.
DATA_PATH = '/Users/ajmendez/Downloads/bank-additional/bank-additional-full.csv'
df = pd.read_csv(DATA_PATH, sep=';')
In [7]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()
Out[7]:
In [8]:
# Rank a hand-picked subset of features by adjusted mutual information
# with the target column 'y' (highest first).
candidate_features = ['education', 'job', 'marital', 'contact',
                      'campaign', 'duration', 'loan', 'poutcome']
ami_scores = [(name, metrics.adjusted_mutual_info_score(df[name], df['y']))
              for name in candidate_features]
sorted(ami_scores, key=lambda pair: pair[1], reverse=True)
Out[8]:
In [9]:
# --- Preprocess: binarize target, scale numerics, one-hot categoricals ---
test_size = .33
pp_data = df.copy()
# Target: last column 'y' is 'no'/'yes' -> 0/1.
# (.ix was removed from pandas; use positional .iloc instead.)
data_labels = pd.Series([0 if x == 'no' else 1 for x in df.iloc[:, -1]])
# Scale numeric data.
# (np.float alias was removed from numpy; use the builtin float.)
dts_cols = [0, 10, 11, 12, 13, 15, 16, 17, 18, 19]
data_to_scale = pp_data.iloc[:, dts_cols].astype(float)  # change int to float
scaler = preprocessing.StandardScaler().fit(data_to_scale)
pp_data.iloc[:, dts_cols] = scaler.transform(data_to_scale)
# Create dummy encoding for categorical data, then splice the dummy
# columns back in by row index.
dtde_cols = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]
data_to_de = pp_data.iloc[:, dtde_cols]
de_data = pd.get_dummies(data_to_de)
pp_data.drop(pp_data.columns[dtde_cols], axis=1, inplace=True)
pp_data = pp_data.merge(de_data, how='inner', left_index=True, right_index=True, copy=False)
# Positional axis argument to drop() is deprecated; name the column explicitly.
pp_data.drop(columns='y', inplace=True)
X = pp_data.values
y = data_labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
In [10]:
# Fit a 2-component PCA and rank features by loading magnitude.
pca = PCA(n_components=2)
pca.fit_transform(pp_data)
pca_df = pd.DataFrame(pca.components_, columns=pp_data.columns, index=['PC-1', 'PC-2'])
# BUG FIX: to rank loadings "regardless of sign", abs() must come BEFORE
# max(): np.abs(pca_df.max(axis=0)) picks the signed max first, so a feature
# with loadings (-0.9, 0.1) would score 0.1 instead of 0.9.
pca_df.abs().max(axis=0).sort_values(ascending=False)
Out[10]:
In [11]:
# Plot per-feature loading magnitudes, largest first.
# BUG FIX: take abs() before max() so large negative loadings aren't masked
# by small positive ones ("regardless of sign").
pca_df.abs().max(axis=0).sort_values(ascending=False).plot()
plt.xticks(rotation='vertical')
Out[11]:
In [12]:
# Find strongly correlated feature pairs (|r| > 0.55).
# Zeroing the upper triangle (and diagonal) ensures each pair appears once.
corr_matrix = pp_data.corr()
corr_matrix.loc[:, :] = np.tril(corr_matrix, k=-1)  # keep strict lower triangle
pairwise = corr_matrix.stack()
pairwise[pairwise.abs() > 0.55].sort_values()
Out[12]:
In [ ]:
# t-SNE embedding of the first 10 columns (the standard-scaled numerics).
# (.ix was removed from pandas; use positional .iloc instead.)
t_X = pp_data.iloc[:, :10].values
# perform t-SNE embedding
tsne = TSNE(n_components=2, init='random', random_state=0)
Y = tsne.fit_transform(t_X)
In [ ]:
# 2-D t-SNE embedding colored by the binary target labels y.
plt.scatter(Y[:, 0], Y[:, 1], c=y, cmap=plt.cm.Set3,s=30,alpha=.8)
In [ ]: