In [1]:
%pylab inline
In [2]:
import pandas as pd
In [3]:
# Load the Iris dataset straight from the UCI Machine Learning Repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target']
df = pd.read_csv(url, names=column_names)
df.head()  # preview the first five rows
Out[3]:
Before applying PCA, standardize the features so that each one has mean 0 and standard deviation 1.
In [4]:
from sklearn.preprocessing import StandardScaler
In [5]:
# Split the frame into the numeric feature matrix x and the class labels y.
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
x = df[features].values
y = df[['target']].values
In [6]:
# Standardize: rescale every feature to zero mean and unit variance,
# which PCA requires so that no feature dominates by scale alone.
scaler = StandardScaler()
stdx = scaler.fit_transform(x)
stdDf = pd.DataFrame(stdx, columns=features)
pd.concat([stdDf, df['target']], axis=1).head()
Out[6]:
In [7]:
from sklearn.decomposition import PCA
In [8]:
# Project the standardized data onto its first two principal components.
pca = PCA(n_components=2)
projected = pca.fit_transform(stdx)
pcDf = pd.DataFrame(projected, columns=['PC1', 'PC2'])
finalDf = pd.concat([pcDf, df['target']], axis=1)
finalDf.head()
Out[8]:
In [11]:
# explained_variance_ratio_ holds fractions in [0, 1]; scale by 100 when
# displaying with a percent sign (the original printed e.g. "0.7296 %").
var1, var2 = pca.explained_variance_ratio_
print('The first component contains %2.4f %% of the variance' % (100 * var1))
print('The second component contains %2.4f %% of the variance' % (100 * var2))
print('Total variance explained %2.4f %%' % (100 * (var1 + var2)))
In [22]:
# Scatter the two principal components, colored by iris species.
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111)
# explained_variance_ratio_ values are fractions; scale by 100 for percent labels.
# Raw strings avoid the invalid '\s' escape warning in the LaTeX sigma.
xlabel = r'Component 1 (%2.2f %% $\sigma^2$)' % (100 * var1)
ylabel = r'Component 2 (%2.2f %% $\sigma^2$)' % (100 * var2)
ax.set_xlabel(xlabel, fontsize=12)
ax.set_ylabel(ylabel, fontsize=12)
ax.set_title('Two component analysis', fontsize=15)
mytargets = np.unique(df['target'].values).tolist()
colors = ['r', 'g', 'b']
for target, color in zip(mytargets, colors):
    mask = finalDf['target'] == target
    # Label each scatter call so the legend maps colors to species reliably,
    # instead of relying on ax.legend(list) matching artist creation order.
    ax.scatter(finalDf.loc[mask, 'PC1'],
               finalDf.loc[mask, 'PC2'],
               c=color, s=10, label=target)
ax.legend()
Out[22]:
In [ ]: