In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
%matplotlib inline
In [9]:
# Manual
# Read the training set, using the "id" column as the row index.
df = pd.read_csv("train.csv.zip", index_col="id")
# Keep only the continuous variables, i.e. columns whose name contains "cont".
df_cont = df.filter(like="cont")
df_cont.head()
Out[9]:
In [10]:
# check if the variables are correlated
sns.heatmap(df_cont.corr())

# Eigendecomposition of the covariance matrix.
# np.linalg.eigh returns the eigenvalues in ASCENDING order, so both outputs
# are flipped to put the largest-variance component first.
cov = df_cont.cov()
eigvals, eigvecs = np.linalg.eigh(cov)
eigvals = eigvals[::-1]
eigvecs = eigvecs[:, ::-1]

# PCA-whiten the data: centre it, rotate it onto the eigenvectors, then divide
# every component by its standard deviation sqrt(eigenvalue).
# Lambda^(-1/2) is diagonal, so its inverse is just the elementwise reciprocal;
# there is no need for a general matrix inversion.
inv_sqrt_eigvals = 1.0 / np.sqrt(eigvals)
lambda_sqrt_inverse = np.diag(inv_sqrt_eigvals)
U = eigvecs
df_cont_cent = df_cont.sub(df_cont.mean())
# (Lambda^(-1/2) U^T X^T)^T == X U Lambda^(-1/2); the broadcasted multiply
# scales column j of the rotated data by 1 / sqrt(eigvals[j]).
# NOTE: the result lives in the principal-component basis (column j is the
# j-th whitened PC), even though the original column labels are reused here.
df_cont_white = pd.DataFrame(
    np.dot(df_cont_cent.values, U) * inv_sqrt_eigvals,
    index=df_cont.index,
    columns=df_cont.columns,
)
In [45]:
# plotting data on the first 2 PCs
# df_cont_white is already expressed in the principal-component basis (the
# whitening step applies U.T, with eigenvectors sorted by decreasing
# eigenvalue), so the first two PCs are simply its first two columns.
# Multiplying by eigvecs a second time would rotate the data away from them.
df_pca2 = df_cont_white.iloc[:, :2].copy()
df_pca2.columns = [0, 1]  # integer labels, as the plotting calls expect
sns.jointplot(x=0, y=1, data=df_pca2)
Out[45]:
In [46]:
# pairplot of first 6 PCs
# df_cont_white is already in the principal-component basis, so the first six
# PCs are its first six columns (no second projection onto eigvecs).
df_pca6 = df_cont_white.iloc[:, :6].copy()
df_pca6.columns = list(range(6))
sns.pairplot(df_pca6)
# Same pairplot restricted to the rows whose loss is at least 25000.
highloss_idx = df.index[df['loss'] >= 25000].tolist()
# .ix was removed from pandas; these are index labels, so use .loc.
sns.pairplot(df_pca6.loc[highloss_idx, :])
Out[46]:
In [30]:
# Scatter of the two projected components, coloured by the loss value.
plt.figure(figsize=(20,10))
# .ix was removed from pandas; select the two columns by position instead.
points = plt.scatter(x=df_pca2.iloc[:,0], y=df_pca2.iloc[:,1], c=df['loss'], cmap='RdBu')
plt.colorbar(points, label='loss')  # without it the colours cannot be read
plt.xlabel('first component')
plt.ylabel('second component')
plt.show()