In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
%matplotlib inline

In [9]:
# Manual
# load data
df = pd.read_csv("train.csv.zip", index_col="id")
# get continous variables
df_cont = df[[c for c in df.columns if "cont" in c]]
df_cont.head()


Out[9]:
cont1 cont2 cont3 cont4 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14
id
1 0.726300 0.245921 0.187583 0.789639 0.310061 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843
2 0.330514 0.737068 0.592681 0.614134 0.885834 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496
5 0.261841 0.358319 0.484196 0.236924 0.397069 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425
10 0.321594 0.555782 0.527991 0.373816 0.422268 0.440945 0.391128 0.31796 0.32128 0.44467 0.327915 0.321570 0.605077 0.602642
11 0.273204 0.159990 0.527991 0.473202 0.704268 0.178193 0.247408 0.24564 0.22089 0.21230 0.204687 0.202213 0.246011 0.432606

In [10]:
# check if the variables are correlated
sns.heatmap(df_cont.corr())
# get covariance and eigen -values/-vectors
cov = df_cont.cov()
eigvals, eigvecs = np.linalg.eigh(cov) # linalg.eigh returns values/vecs in reverse order
eigvals = eigvals[::-1]
eigvecs = eigvecs[:,::-1]
# whiten data
lambda_sqrt_inverse = np.linalg.inv(np.sqrt(np.diag(eigvals)))
U = eigvecs
df_cont_cent = df_cont.sub(df_cont.mean())
df_cont_white = np.dot(np.dot(lambda_sqrt_inverse, U.T), df_cont_cent.T)
df_cont_white = pd.DataFrame(df_cont_white.T, index=df_cont.index ,columns=df_cont.columns)



In [45]:
# plotting data with 2 PCAs
df_pca2 = df_cont_white.dot(eigvecs[:,:2])
sns.jointplot(x=0, y=1, data=df_pca2)


Out[45]:
<seaborn.axisgrid.JointGrid at 0x7f7ebb80b690>

In [46]:
# pairplot of first 6 PCs
df_pca6 = df_cont_white.dot(eigvecs[:,:6])
sns.pairplot(df_pca6)
highloss_idx = df.loc[df['loss'] >= 25000].index.tolist()
sns.pairplot(df_pca6.ix[highloss_idx,:])


Out[46]:
<seaborn.axisgrid.PairGrid at 0x7f7eb6ff4bd0>

In [30]:
plt.figure(figsize=(20,10))
plt.scatter(x=df_pca2.ix[:,0], y=df_pca2.ix[:,1], c=df['loss'], cmap='RdBu')
plt.show()



In [41]:
plt.figure(figsize=(20,10))
plt.scatter(x=df_pca6.ix[highloss_idx,2], y=df_pca6.ix[highloss_idx,3], c='g')
plt.show()
sns.jointplot(x=2, y=3, data=df_pca6)


Out[41]:
<seaborn.axisgrid.PairGrid at 0x7f7ebeffea10>

In [7]:
kpca = KernelPCA(n_components=6, kernel='rbf')
df_cont_kpca = kpca.fit_transform(df_cont)
df_cont_kpca.shape


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-7-5a65e39b4e18> in <module>()
      1 kpca = KernelPCA(n_components=6, kernel='rbf')
----> 2 df_cont_kpca = kpca.fit_transform(df_cont)
      3 df_cont_kpca.shape

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/decomposition/kernel_pca.py in fit_transform(self, X, y, **params)
    224         X_new: array-like, shape (n_samples, n_components)
    225         """
--> 226         self.fit(X, **params)
    227 
    228         X_transformed = self.alphas_ * np.sqrt(self.lambdas_)

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/decomposition/kernel_pca.py in fit(self, X, y)
    200             Returns the instance itself.
    201         """
--> 202         K = self._get_kernel(X)
    203         self._fit_transform(K)
    204 

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/decomposition/kernel_pca.py in _get_kernel(self, X, Y)
    133                       "coef0": self.coef0}
    134         return pairwise_kernels(X, Y, metric=self.kernel,
--> 135                                 filter_params=True, **params)
    136 
    137     def _fit_transform(self, K):

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/metrics/pairwise.py in pairwise_kernels(X, Y, metric, filter_params, n_jobs, **kwds)
   1345         raise ValueError("Unknown kernel %r" % metric)
   1346 
-> 1347     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1052     if n_jobs == 1:
   1053         # Special case to avoid picklability checks in delayed
-> 1054         return func(X, Y, **kwds)
   1055 
   1056     # TODO: in some cases, backend='threading' may be appropriate

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/metrics/pairwise.py in rbf_kernel(X, Y, gamma)
    807         gamma = 1.0 / X.shape[1]
    808 
--> 809     K = euclidean_distances(X, Y, squared=True)
    810     K *= -gamma
    811     np.exp(K, K)    # exponentiate K in-place

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/metrics/pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
    229         YY = row_norms(Y, squared=True)[np.newaxis, :]
    230 
--> 231     distances = safe_sparse_dot(X, Y.T, dense_output=True)
    232     distances *= -2
    233     distances += XX

/projects/sage/sage-7.3/local/lib/python2.7/site-packages/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
    182         return ret
    183     else:
--> 184         return fast_dot(a, b)
    185 
    186 

MemoryError: 

In [ ]: