IRIS: random forests and PCA


In [ ]:
from sklearn import datasets
iris = datasets.load_iris()

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
X = iris.data
y = iris.target

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .3, random_state = 0)

In [5]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)

In [6]:
X_train[0:5]


Out[6]:
array([[ 5. ,  2. ,  3.5,  1. ],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.7,  2.5,  5.8,  1.8]])

In [7]:
X_train_std[0:5]


Out[7]:
array([[-1.02366372, -2.37846268, -0.18295039, -0.29145882],
       [ 0.69517462, -0.10190314,  0.93066067,  0.73721938],
       [ 0.92435306,  0.58106472,  1.04202177,  1.6373128 ],
       [ 0.1222285 , -1.92315077,  0.6522579 ,  0.35146505],
       [ 0.92435306, -1.24018291,  1.09770233,  0.73721938]])

In [8]:
import numpy as np
cov_mat = np.cov(X_train_std.T)#dxd dim matrix where d is the number of dimensions
cov_mat


Out[8]:
array([[ 1.00961538, -0.03658858,  0.89282533,  0.84055197],
       [-0.03658858,  1.00961538, -0.3421826 , -0.28950335],
       [ 0.89282533, -0.3421826 ,  1.00961538,  0.97784588],
       [ 0.84055197, -0.28950335,  0.97784588,  1.00961538]])

In [9]:
eigenvals, eigenvecs = np.linalg.eig(cov_mat)
print("eigenvals = \n ",eigenvals, "\n")
print("eigenvecs = \n",eigenvecs, "\n")


eigenvals = 
  [ 2.89976476  0.98710977  0.13476983  0.01681717] 

eigenvecs = 
 [[  5.35500399e-01  -3.25611548e-01  -7.32041268e-01   2.67080558e-01]
 [ -2.04195389e-01  -9.44913832e-01   2.30263378e-01  -1.11448955e-01]
 [  5.86174262e-01   9.09058855e-04   1.37061857e-01  -7.98506703e-01]
 [  5.72663340e-01  -3.33787741e-02   6.26345277e-01   5.27858078e-01]] 


In [10]:
tot = sum(eigenvals)
var_exp = [(i/tot) for i in sorted(eigenvals, reverse = True)]#'True' is case sensitive
print(var_exp)
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)


[0.71803698889091139, 0.24442718138870986, 0.033371577775729543, 0.0041642519446492875]
[ 0.71803699  0.96246417  0.99583575  1.        ]

In [11]:
#plot
import matplotlib.pyplot as plt
plt.bar(range(1,5), var_exp, alpha = .5, align = 'center', label = "Individual Explained Variance")
plt.step(range(1,5), cum_var_exp, alpha = .5, where = 'mid', label = "Cumulative Explained Variance")

#add labels
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal Components')

#add legend
plt.legend(loc = 'best')
plt.show()

In [ ]:


In [12]:
iris.feature_names#order isn't preserved in PCA since it's an unsupervised algorithm


Out[12]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [13]:
#so let's explore a supervised learning algorithm: Random Forests

In [14]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 1000, random_state = 0, n_jobs = -1)
#n_jobs = number of jobs to run in parallel, -1 set to number of cores

In [15]:
forest.fit(X_train, y_train)


Out[15]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
importances = forest.feature_importances_

In [17]:
indices = np.argsort(importances)[::-1]

In [18]:
for f in range(X_train.shape[1]):
    print(f+1, iris.feature_names[f], importances[indices[f]])


1 sepal length (cm) 0.463044116595
2 sepal width (cm) 0.402157509467
3 petal length (cm) 0.108614960645
4 petal width (cm) 0.0261834132934

In [45]:
#visualization
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align = 'center', alpha = .7)
plt.xticks(range(X_train.shape[1]), iris.feature_names, rotation = 90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()

In [ ]:
#should check the variances of each feature. Petal width and length may have small variances (after standardization)
#compared to the sepal lengths and widths

In [ ]:


In [30]:
#finishing PCA
#FEATURE TRANFORMATION
eigenpairs = [(np.abs(eigenvals[i]), eigenvecs[:,i]) for i in range(len(eigenvals))]

#sort these eigenpairs by decreasing eigenvalues
eigenpairs.sort(reverse = True)

In [ ]:


In [28]:
eigenvecs[:,0]#each column is an eigenvector


Out[28]:
array([ 0.5355004 , -0.20419539,  0.58617426,  0.57266334])

In [33]:
eigenpairs


Out[33]:
[(2.8997647628286778,
  array([ 0.5355004 , -0.20419539,  0.58617426,  0.57266334])),
 (0.9871097709928659,
  array([ -3.25611548e-01,  -9.44913832e-01,   9.09058855e-04,
          -3.33787741e-02])),
 (0.13476983332506148,
  array([-0.73204127,  0.23026338,  0.13706186,  0.62634528])),
 (0.0168171713149298,
  array([ 0.26708056, -0.11144895, -0.7985067 ,  0.52785808]))]

In [34]:
w = np.hstack((eigenpairs[0][1][:,np.newaxis], eigenpairs[1][1][:,np.newaxis]))
print('Matrix W:\n', w)


Matrix W:
 [[  5.35500399e-01  -3.25611548e-01]
 [ -2.04195389e-01  -9.44913832e-01]
 [  5.86174262e-01   9.09058855e-04]
 [  5.72663340e-01  -3.33787741e-02]]

In [37]:
#project the training set onto the 2 most importance dimensions
X_train_pca = X_train_std.dot(w)

In [47]:
#visualize
colors = ['r','b','g']
markers = ['s','x','o']

In [48]:
for l,c,m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train==1,0], X_train_pca[y_train==1, 1], c = c, label = l, marker = m)

In [49]:
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc = 'lower left')
plt.show()

In [51]:
y_train


Out[51]:
array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 2,
       0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2,
       0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2,
       2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [19]:
b = np.arange(25).reshape(5,5)
print(b)
b[:, 2:4]


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
Out[19]:
array([[ 2,  3],
       [ 7,  8],
       [12, 13],
       [17, 18],
       [22, 23]])