In [1]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3],
     [0, 1, 4, 3],
     [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
Out[1]:
array([[2, 0],
       [1, 4],
       [1, 1]])
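With its default threshold of 0, VarianceThreshold only drops constant features (columns 0 and 3 above). It also accepts a nonzero threshold; here is a minimal sketch (not part of the notebook above), assuming Boolean features whose variance is p(1 - p), that drops any feature taking the same value in more than roughly 80% of samples:

from sklearn.feature_selection import VarianceThreshold

X_bool = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
selector = VarianceThreshold(threshold=0.8 * (1 - 0.8))
print(selector.fit_transform(X_bool))  # the mostly-zero first column is removed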
In [2]:
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
sns.heatmap(pd.DataFrame(X).corr(), cmap=sns.diverging_palette(220, 10, as_cmap=True));
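The heatmap only visualizes pairwise correlations. As a sketch of how one might act on it, the following drops one feature from each highly correlated pair (the 0.9 cutoff and the keep-the-first-column choice are illustrative assumptions, not from the notebook):

import numpy as np

corr = pd.DataFrame(X).corr().abs()
# keep only the upper triangle so each pair is inspected once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
print('Columns to drop:', to_drop)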
In [3]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
X, y = load_iris(return_X_y=True)
print('Before feature selection:', X.shape)
selector = SelectKBest(f_classif, k=2).fit(X, y)
X_new = selector.transform(X)
print('After feature selection:', X_new.shape)
print('Scores:', selector.scores_)
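f_classif scores each feature with a one-way ANOVA F-test, which captures linear separation between class means. A hedged variant, swapping in mutual_info_classif (an assumption, not used above), can help when the feature/target relationship is nonlinear:

from sklearn.feature_selection import mutual_info_classif

selector_mi = SelectKBest(mutual_info_classif, k=2).fit(X, y)
print('MI scores:', selector_mi.scores_)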
Principal component analysis (PCA) is a dimensionality reduction algorithm that we can use to find structure in our data. The main aim is to find a surface onto which the projection error is minimized. This surface is a lower-dimensional subspace spanned by the principal components of the data. The principal components are the directions along which the projected data has maximum variance; the direction along which the data varies most is called the principal axis of variation.
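To make this concrete, here is a from-scratch sketch in plain NumPy (an illustration of the math only, not how sklearn's PCA below is invoked): center the data, take the top eigenvectors of the covariance matrix as the principal axes, and project onto them.

import numpy as np

def pca_project(X, k):
    Xc = X - X.mean(axis=0)            # center each feature
    cov = np.cov(Xc, rowvar=False)     # covariance matrix of the features
    vals, vecs = np.linalg.eigh(cov)   # eigh returns ascending eigenvalues
    components = vecs[:, ::-1][:, :k]  # top-k eigenvectors = principal axes
    return Xc @ components             # projection with maximal variance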
In [4]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
In [5]:
import gzip, pickle
# mnist.pkl.gz holds three (inputs, labels) splits; the third is discarded here
with gzip.open('Datasets/mnist.pkl.gz', 'rb') as f:
    (input_train, output_train), (input_test, output_test), _ = pickle.load(f, encoding='bytes')
In [6]:
for i in range(4):
    plt.subplot(2, 2, i + 1)
    plt.imshow(input_train[i].reshape((28, 28)), cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
In [7]:
from sklearn.ensemble import RandomForestClassifier
In [8]:
randomforest = RandomForestClassifier(n_estimators=30)
In [9]:
randomforest.fit(input_train, output_train)
Out[9]:
In [10]:
from sklearn.metrics import classification_report
print(classification_report(output_test, randomforest.predict(input_test)))
In [11]:
from sklearn.decomposition import PCA
In [12]:
pca = PCA(n_components=500)
pca.fit(input_train)
Out[12]:
In [13]:
plt.figure(figsize=(12,6))
plt.plot(np.cumsum(pca.explained_variance_ratio_[:500]), marker='o')
plt.show()
In [14]:
np.cumsum(pca.explained_variance_ratio_[0:500])[200]
Out[14]:
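Instead of reading the cutoff from the curve, PCA can be handed the variance target directly: a float in (0, 1) makes it keep the smallest number of components that explains that fraction of the variance (a sketch, assuming ~95% as the target):

pca_95 = PCA(n_components=0.95)
pca_95.fit(input_train)
print('Components kept:', pca_95.n_components_)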
In [15]:
pca = PCA(n_components=200)
pca.fit(input_train)
Out[15]:
In [16]:
x_train = pca.transform(input_train)
x_test = pca.transform(input_test)
In [17]:
reconstructed = pca.inverse_transform(x_train)  # invert the projection once, back to 784-dim pixel space
for i in range(4):
    plt.subplot(2, 2, i + 1)
    plt.imshow(reconstructed[i].reshape((28, 28)), cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
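Beyond eyeballing the reconstructions, the information lost by keeping 200 of the 784 pixel dimensions can be quantified; a sketch using mean squared reconstruction error:

reconstructed = pca.inverse_transform(x_train)
print('Mean squared reconstruction error:', np.mean((input_train - reconstructed) ** 2))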
In [18]:
randomforest = RandomForestClassifier(n_estimators=30)
In [19]:
randomforest.fit(x_train, output_train)
Out[19]:
In [20]:
from sklearn.metrics import classification_report
print(classification_report(output_test, randomforest.predict(x_test)))
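As a closing sketch, the two stages can be chained with a Pipeline (assuming the same 200 components and 30 trees as above), which keeps the PCA fit confined to the training data and reapplies it automatically at prediction time:

from sklearn.pipeline import Pipeline

pipe = Pipeline([('pca', PCA(n_components=200)),
                 ('rf', RandomForestClassifier(n_estimators=30))])
pipe.fit(input_train, output_train)
print(classification_report(output_test, pipe.predict(input_test)))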