In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [8]:
# Read the CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='Downloads/HR_comma_sep.csv')
In [9]:
# Collect the column labels as a plain list and print them under a heading.
columns_names = df.columns.tolist()
print("Columns names:", columns_names, sep="\n")
In [10]:
# Preview the first five rows of the loaded frame.
df.head(n=5)
Out[10]:
In [11]:
# Pairwise correlation of the numeric columns.
# `df` still contains 'sales' and 'salary' here; on pandas >= 2.0 calling
# DataFrame.corr() on a frame with non-numeric columns raises instead of
# silently dropping them, so restrict to numeric/bool columns explicitly.
# select_dtypes works on old and new pandas alike.
df.select_dtypes(include=['number', 'bool']).corr()
Out[11]:
In [12]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns ('sales', 'salary' are still present in `df`) are
# excluded explicitly: pandas >= 2.0 no longer drops them silently in corr().
correlation = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='cubehelix')
# Title typo fixed ('fearures' -> 'features').
plt.title('Correlation between different features')
Out[12]:
In [13]:
# Distinct values found in the 'sales' column.
df.loc[:, 'sales'].unique()
Out[13]:
In [14]:
# Work on a copy of `df` without the 'sales' and 'salary' columns; `df` itself is untouched.
df_drop = df.drop(columns=['sales', 'salary'])
df_drop.head(n=5)
Out[14]:
In [16]:
# Column labels of df_drop as a list, in their current order.
cols = df_drop.columns.tolist()
# Bare expression: shown as the cell's output.
cols
Out[16]:
In [17]:
# Re-order df_drop's columns to follow `cols`.
# NOTE(review): in the cells visible here `cols` is df_drop's own column list,
# unmodified, so this leaves the column order unchanged — confirm whether a
# re-ordering step was meant to precede it.
df_drop = df_drop.reindex(cols, axis='columns')
In [26]:
# X: the first eight columns of df_drop as a NumPy array.
X = df_drop.iloc[:, :8].values
# y: the last column of df_drop as a NumPy array.
# NOTE(review): if df_drop has eight or fewer columns, this last column is
# also part of X — confirm that is intended.
y = df_drop.iloc[:, -1].values
# Show both arrays as the cell output.
X, y
Out[26]:
In [27]:
# Display y (the last column of df_drop as an array).
y
Out[27]:
In [28]:
# Dimensions of X as a (rows, columns) tuple.
X.shape
Out[28]:
In [29]:
# Dimensions of y as a tuple.
y.shape
Out[29]:
In [30]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of X; the scaler is fitted on X and then applied to the same X.
X_std = StandardScaler().fit(X).transform(X)
In [31]:
# Display the array returned by StandardScaler().fit_transform(X).
X_std
Out[31]:
In [32]:
# Per-column mean of the standardized data.
mean_vec = X_std.mean(axis=0)
# Sample covariance computed by hand: centered^T . centered / (n_rows - 1).
cov_mat = (X_std - mean_vec).T.dot(X_std - mean_vec) / (len(X_std) - 1)
print('Covariance matrix \n%s' % (cov_mat,))
In [33]:
# Covariance via NumPy, treating each column of X_std as a variable.
X_cov = np.cov(X_std, rowvar=False)
In [34]:
# Display the matrix returned by np.cov(X_std.T).
X_cov
Out[34]:
In [43]:
# Annotated heatmap of the hand-computed covariance matrix `cov_mat`.
plt.figure(figsize=(8, 8))
sns.heatmap(
    data=cov_mat,
    cmap='RdBu_r',
    annot=True,
    square=True,
    vmax=1,
)
plt.title(label='Correlation between different features')
Out[43]:
In [44]:
# Eigen-decomposition of the covariance matrix; column i of eig_vecs pairs with eig_vals[i].
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))
In [45]:
# Pair each eigenvalue's magnitude with its eigenvector (the matching column of eig_vecs).
eig_pairs = [(np.abs(value), eig_vecs[:, idx]) for idx, value in enumerate(eig_vals)]
In [47]:
# Display the (|eigenvalue|, eigenvector) pairs in their original order.
eig_pairs
Out[47]:
In [48]:
# Sort the pairs in place, largest |eigenvalue| first.
eig_pairs.sort(
    key=lambda pair: pair[0],
    reverse=True,
)
In [49]:
# Display the pairs again, now ordered by descending |eigenvalue|.
eig_pairs
Out[49]:
In [51]:
# Total of all eigenvalues.
tot = sum(eig_vals)
# Each eigenvalue's share of the total, in percent, largest eigenvalue first.
var_exp = [
    (value / tot) * 100
    for value in sorted(eig_vals, reverse=True)
]
tot, var_exp
Out[51]:
In [52]:
# Bar chart of the explained variance per principal component, drawn with the
# 'dark_background' style applied only inside this block.
with plt.style.context('dark_background'):
    plt.figure(figsize=(6, 4))
    # One bar per entry of var_exp. The x positions are derived from its length
    # rather than hard-coded, so the call cannot fail with a shape mismatch
    # when the number of components differs from 7.
    plt.bar(range(len(var_exp)), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
In [ ]: