IRIS: random forests and PCA
In [ ]:
from sklearn import datasets
iris = datasets.load_iris()
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection is the current module
from sklearn.preprocessing import StandardScaler
In [2]:
X = iris.data
y = iris.target
In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .3, random_state = 0)
In [5]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)  # reuse the scaler fitted on the training set; don't refit on the test set
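A quick added sanity check (not in the original cells): after standardization the training features should have roughly zero mean and unit variance.
In [ ]:
# standardized training data: means ~0, standard deviations ~1 (added check)
print(X_train_std.mean(axis=0))
print(X_train_std.std(axis=0))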
In [6]:
X_train[0:5]
Out[6]:
In [7]:
X_train_std[0:5]
Out[7]:
In [8]:
import numpy as np
cov_mat = np.cov(X_train_std.T)  # d x d matrix, where d is the number of dimensions
cov_mat
Out[8]:
In [9]:
eigenvals, eigenvecs = np.linalg.eig(cov_mat)
print("eigenvals = \n ",eigenvals, "\n")
print("eigenvecs = \n",eigenvecs, "\n")
In [10]:
tot = sum(eigenvals)
var_exp = [(i/tot) for i in sorted(eigenvals, reverse = True)]#'True' is case sensitive
print(var_exp)
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)
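If we wanted to pick the number of components automatically, one common heuristic (sketched here with an assumed 95% threshold, not part of the original notebook) is the smallest k whose cumulative explained variance reaches the threshold:
In [ ]:
# smallest k reaching 95% cumulative explained variance (the 0.95 threshold is an assumed choice)
k = int(np.argmax(cum_var_exp >= 0.95)) + 1
print(k)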
In [11]:
#plot
import matplotlib.pyplot as plt
plt.bar(range(1,5), var_exp, alpha = .5, align = 'center', label = "Individual Explained Variance")
plt.step(range(1,5), cum_var_exp, alpha = .5, where = 'mid', label = "Cumulative Explained Variance")
#add labels
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal Components')
#add legend
plt.legend(loc = 'best')
plt.show()
In [ ]:
In [12]:
iris.feature_names  # the principal components are linear combinations of these features, so PCA (an unsupervised method) doesn't keep a one-to-one mapping to feature names
Out[12]:
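As a cross-check on the manual eigendecomposition above (an added cell using scikit-learn's PCA class; not part of the original flow):
In [ ]:
# sklearn's PCA should report the same explained-variance ratios as var_exp above
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train_std)
print(pca.explained_variance_ratio_)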
In [13]:
#so let's explore a supervised learning algorithm: Random Forests
In [14]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 1000, random_state = 0, n_jobs = -1)
# n_jobs = number of jobs to run in parallel; -1 uses all available cores
In [15]:
forest.fit(X_train, y_train)
Out[15]:
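Before ranking feature importances, a quick hold-out accuracy check (an added cell, assuming the forest and split from above):
In [ ]:
# mean accuracy of the fitted forest on the held-out test set (added check)
print(forest.score(X_test, y_test))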
In [16]:
importances = forest.feature_importances_
In [17]:
indices = np.argsort(importances)[::-1]
In [18]:
for f in range(X_train.shape[1]):
    print(f + 1, iris.feature_names[indices[f]], importances[indices[f]])  # index the names with indices[f] so names match the sorted importances
In [45]:
#visualization
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], align = 'center', alpha = .7)
plt.xticks(range(X_train.shape[1]), [iris.feature_names[i] for i in indices], rotation = 90)  # tick labels in the same sorted order as the bars
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
In [ ]:
#should check the variances of each feature on the raw data; petal width and length may have smaller variances
#than sepal length and width (after standardization every feature has unit variance, so compare before scaling)
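A small added sketch to actually run that check (assumes X_train and X_train_std from above):
In [ ]:
# per-feature variances before and after standardization (added check)
print(X_train.var(axis=0))      # raw-data variances differ across features
print(X_train_std.var(axis=0))  # after StandardScaler each is ~1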
In [ ]:
In [30]:
#finishing PCA
#FEATURE TRANSFORMATION
eigenpairs = [(np.abs(eigenvals[i]), eigenvecs[:,i]) for i in range(len(eigenvals))]
#sort these eigenpairs by decreasing eigenvalues
eigenpairs.sort(key = lambda k: k[0], reverse = True)  # sort by eigenvalue; a key is needed because the tuples contain arrays
In [ ]:
In [28]:
eigenvecs[:, 0]  # each column of eigenvecs is an eigenvector
Out[28]:
In [33]:
eigenpairs
Out[33]:
In [34]:
w = np.hstack((eigenpairs[0][1][:,np.newaxis], eigenpairs[1][1][:,np.newaxis]))
print('Matrix W:\n', w)
In [37]:
#project the training set onto the 2 most important dimensions
X_train_pca = X_train_std.dot(w)
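For completeness, the test set would be projected with the same W (an added line, assuming X_test_std and w from above):
In [ ]:
# project the standardized test set onto the same two principal components
X_test_pca = X_test_std.dot(w)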
In [47]:
#visualize
colors = ['r','b','g']
markers = ['s','x','o']
In [48]:
for l, c, m in zip(np.unique(y_train), colors, markers):
    plt.scatter(X_train_pca[y_train == l, 0], X_train_pca[y_train == l, 1], c = c, label = l, marker = m)  # use the loop variable l, not a hard-coded class label
In [49]:
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc = 'lower left')
plt.show()
In [51]:
y_train
Out[51]:
In [19]:
# quick numpy slicing refresher (unrelated to the analysis above): b[:, 2:4] selects columns 2 and 3 of every row
b = np.arange(25).reshape(5, 5)
print(b)
b[:, 2:4]
Out[19]: