IRIS: random forests and PCA



In [ ]:

    
from sklearn import datasets
iris = datasets.load_iris()

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:

    
X = iris.data
y = iris.target



In [4]:

    
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .3, random_state = 0)



In [5]:

    
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)



In [6]:

    
X_train[0:5]









    Out[6]:





array([[ 5. ,  2. ,  3.5,  1. ],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.7,  2.5,  5.8,  1.8]])



In [7]:

    
X_train_std[0:5]









    Out[7]:





array([[-1.02366372, -2.37846268, -0.18295039, -0.29145882],
       [ 0.69517462, -0.10190314,  0.93066067,  0.73721938],
       [ 0.92435306,  0.58106472,  1.04202177,  1.6373128 ],
       [ 0.1222285 , -1.92315077,  0.6522579 ,  0.35146505],
       [ 0.92435306, -1.24018291,  1.09770233,  0.73721938]])



In [8]:

    
import numpy as np
cov_mat = np.cov(X_train_std.T)#dxd dim matrix where d is the number of dimensions
cov_mat









    Out[8]:





array([[ 1.00961538, -0.03658858,  0.89282533,  0.84055197],
       [-0.03658858,  1.00961538, -0.3421826 , -0.28950335],
       [ 0.89282533, -0.3421826 ,  1.00961538,  0.97784588],
       [ 0.84055197, -0.28950335,  0.97784588,  1.00961538]])



In [9]:

    
eigenvals, eigenvecs = np.linalg.eig(cov_mat)
print("eigenvals = \n ",eigenvals, "\n")
print("eigenvecs = \n",eigenvecs, "\n")









    



eigenvals = 
  [ 2.89976476  0.98710977  0.13476983  0.01681717] 

eigenvecs = 
 [[  5.35500399e-01  -3.25611548e-01  -7.32041268e-01   2.67080558e-01]
 [ -2.04195389e-01  -9.44913832e-01   2.30263378e-01  -1.11448955e-01]
 [  5.86174262e-01   9.09058855e-04   1.37061857e-01  -7.98506703e-01]
 [  5.72663340e-01  -3.33787741e-02   6.26345277e-01   5.27858078e-01]]



In [10]:

    
tot = sum(eigenvals)
var_exp = [(i/tot) for i in sorted(eigenvals, reverse = True)]#'True' is case sensitive
print(var_exp)
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)









    



[0.71803698889091139, 0.24442718138870986, 0.033371577775729543, 0.0041642519446492875]
[ 0.71803699  0.96246417  0.99583575  1.        ]



In [11]:

    
#plot
import matplotlib.pyplot as plt
plt.bar(range(1,5), var_exp, alpha = .5, align = 'center', label = "Individual Explained Variance")
plt.step(range(1,5), cum_var_exp, alpha = .5, where = 'mid', label = "Cumulative Explained Variance")

#add labels
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal Components')

#add legend
plt.legend(loc = 'best')
plt.show()



In [ ]:



In [12]:

    
iris.feature_names#order isn't preserved in PCA since it's an unsupervised algorithm









    Out[12]:





['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']



In [13]:

    
#so let's explore a supervised learning algorithm: Random Forests



In [14]:

    
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 1000, random_state = 0, n_jobs = -1)
#n_jobs = number of jobs to run in parallel, -1 set to number of cores



In [15]:

    
forest.fit(X_train, y_train)









    Out[15]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)



In [16]:

    
importances = forest.feature_importances_



In [17]:

    
indices = np.argsort(importances)[::-1]



In [18]:

    
for f in range(X_train.shape[1]):
    print(f+1, iris.feature_names[f], importances[indices[f]])









    



1 sepal length (cm) 0.463044116595
2 sepal width (cm) 0.402157509467
3 petal length (cm) 0.108614960645
4 petal width (cm) 0.0261834132934



In [45]:

    
#visualization
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align = 'center', alpha = .7)
plt.xticks(range(X_train.shape[1]), iris.feature_names, rotation = 90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()



In [ ]:

    
#should check the variances of each feature. Petal width and length may have small variances (after standardization)
#compared to the sepal lengths and widths



In [ ]:



In [30]:

    
#finishing PCA
#FEATURE TRANFORMATION
eigenpairs = [(np.abs(eigenvals[i]), eigenvecs[:,i]) for i in range(len(eigenvals))]

#sort these eigenpairs by decreasing eigenvalues
eigenpairs.sort(reverse = True)



In [ ]:



In [28]:

    
eigenvecs[:,0]#each column is an eigenvector









    Out[28]:





array([ 0.5355004 , -0.20419539,  0.58617426,  0.57266334])



In [33]:

    
eigenpairs









    Out[33]:





[(2.8997647628286778,
  array([ 0.5355004 , -0.20419539,  0.58617426,  0.57266334])),
 (0.9871097709928659,
  array([ -3.25611548e-01,  -9.44913832e-01,   9.09058855e-04,
          -3.33787741e-02])),
 (0.13476983332506148,
  array([-0.73204127,  0.23026338,  0.13706186,  0.62634528])),
 (0.0168171713149298,
  array([ 0.26708056, -0.11144895, -0.7985067 ,  0.52785808]))]



In [34]:

    
w = np.hstack((eigenpairs[0][1][:,np.newaxis], eigenpairs[1][1][:,np.newaxis]))
print('Matrix W:\n', w)









    



Matrix W:
 [[  5.35500399e-01  -3.25611548e-01]
 [ -2.04195389e-01  -9.44913832e-01]
 [  5.86174262e-01   9.09058855e-04]
 [  5.72663340e-01  -3.33787741e-02]]



In [37]:

    
#project the training set onto the 2 most importance dimensions
X_train_pca = X_train_std.dot(w)



In [47]:

    
#visualize
colors = ['r','b','g']
markers = ['s','x','o']



In [48]:

    
for l,c,m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train==1,0], X_train_pca[y_train==1, 1], c = c, label = l, marker = m)



In [49]:

    
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc = 'lower left')
plt.show()



In [51]:

    
y_train









    Out[51]:





array([1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 2,
       0, 0, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2,
       0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       2, 1, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2,
       2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 2, 0])



In [19]:

    
b = np.arange(25).reshape(5,5)
print(b)
b[:, 2:4]









    



[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]






    Out[19]:





array([[ 2,  3],
       [ 7,  8],
       [12, 13],
       [17, 18],
       [22, 23]])