In [1]:
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import fetch_olivetti_faces
In [2]:
faces = fetch_olivetti_faces()
faces.DESCR
Out[2]:
In [3]:
faces.keys()
Out[3]:
In [4]:
faces.images.shape
Out[4]:
In [5]:
faces.data.shape
Out[5]:
In [6]:
faces.target.shape
Out[6]:
In [7]:
np.max(faces.data)
Out[7]:
In [8]:
np.min(faces.data)
Out[8]:
In [9]:
np.median(faces.data)
Out[9]:
In [10]:
def print_faces(images, target, top_n):
    # set up the figure size in inches
    fig = plt.figure(figsize=(20, 20))
    for i in range(top_n):
        # plot the images in a 20x20 grid, hiding the axis ticks
        p = fig.add_subplot(20, 20, i + 1, xticks=[], yticks=[])
        p.imshow(images[i], cmap=plt.cm.bone)
        # label each face with its target (person id) and its index
        p.text(0, 14, str(target[i]))
        p.text(0, 59, str(i))

print_faces(faces.images, faces.target, 20)
plt.show()
In [11]:
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
svc_1 = SVC(kernel='linear')
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0)
In [12]:
def evaluate_cross_validation(clf, X, y, K):
    # create a K-fold cross-validation iterator over the training set
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # score each fold with the estimator's default metric (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores

evaluate_cross_validation(svc_1, X_train, y_train, 5)
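The sem import above is not used in this cell; a minimal sketch of how it could summarize the fold scores (the helper name mean_score is an assumption, not part of the original notebook):

import numpy as np
from scipy.stats import sem

def mean_score(scores):
    # mean fold accuracy together with its standard error across folds
    return "Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores))

Printing mean_score(scores) right after the print scores line would condense the five fold accuracies into a single figure with an error estimate.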
In [13]:
from sklearn import metrics
def train_and_test(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    print "Accuracy on training set:"
    print clf.score(X_train, y_train)
    print "Accuracy on testing set:"
    print clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print "Classification report:"
    print metrics.classification_report(y_test, y_pred)
    print "Confusion matrix:"
    print metrics.confusion_matrix(y_test, y_pred)

train_and_test(svc_1, X_train, X_test, y_train, y_test)
In [14]:
glasses = [  # index ranges of the images in which the subject is wearing glasses
(10, 19), (30, 32), (37, 38), (50, 59), (63, 64),
(69, 69), (120, 121), (124, 129), (130, 139), (160, 161),
(164, 169), (180, 182), (185, 185), (189, 189), (190, 192),
(194, 194), (196, 199), (260, 269), (270, 279), (300, 309),
(330, 339), (358, 359), (360, 369)]
In [15]:
def create_target(segments):
    # start from an all-zeros target of the same length as faces.target
    y = np.zeros(faces.target.shape[0])
    # mark every image inside the given (start, end) ranges, inclusive, as positive
    for (start, end) in segments:
        y[start:end + 1] = 1
    return y

target_glasses = create_target(glasses)
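A quick sanity check (a sketch, not in the original run) just counts how many images end up labeled as positive:

# number of images labeled as wearing glasses, out of the total
print int(target_glasses.sum()), target_glasses.shape[0]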
In [16]:
X_train, X_test, y_train, y_test = train_test_split(faces.data, target_glasses, test_size=0.25, random_state=0)
In [17]:
svc_2 = SVC(kernel='linear')
In [18]:
evaluate_cross_validation(svc_2, X_train, y_train, 5)
In [19]:
train_and_test(svc_2, X_train, X_test, y_train, y_test)
In [20]:
X_test = faces.data[30:40]
y_test = target_glasses[30:40]
y_test.shape
Out[20]:
In [21]:
select = np.ones(target_glasses.shape[0])
In [22]:
select[30:40] = 0
In [23]:
X_train = faces.data[select == 1]
y_train = target_glasses[select == 1]
In [24]:
y_train.shape
Out[24]:
In [25]:
svc_3 = SVC(kernel='linear')
In [26]:
train_and_test(svc_3, X_train, X_test, y_train, y_test)
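To inspect what svc_3 predicted for the ten held-out images, one option (a sketch reusing the print_faces helper defined earlier; the eval_faces name is made up here) is to reshape each flattened row back into a 64x64 image and plot it with its predicted label:

y_pred = svc_3.predict(X_test)
# each row of X_test is a flattened 64x64 (4096-pixel) face image
eval_faces = [np.reshape(a, (64, 64)) for a in X_test]
print_faces(eval_faces, y_pred, 10)
plt.show()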
In [27]:
from sklearn.datasets import fetch_20newsgroups
In [28]:
news = fetch_20newsgroups(subset='all')
In [29]:
print type(news.data), type(news.target), type(news.target_names)
In [30]:
print news.target_names
In [31]:
len(news.data)
Out[31]:
In [32]:
len(news.target)
Out[32]:
In [33]:
news.data[0]  # content of the document at index 0
Out[33]:
In [34]:
news.target[0], news.target_names[news.target[0]]  # target value and its category name
Out[34]:
In [35]:
SPLIT_PERC = 0.75
split_size = int(len(news.data) * SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]
In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
In [37]:
clf_1 = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
In [38]:
clf_2 = Pipeline([('vect', HashingVectorizer(non_negative=True)), ('clf', MultinomialNB())])  # non_negative=True keeps hashed feature values >= 0, as MultinomialNB requires
In [39]:
clf_3 = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
In [40]:
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
In [41]:
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    print clf
    evaluate_cross_validation(clf, news.data, news.target, 5)
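If the TF-IDF pipeline comes out ahead, one natural follow-up (a sketch only; clf_4 is not part of the original comparison) is to drop English stop words before weighting and evaluate it the same way:

clf_4 = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
evaluate_cross_validation(clf_4, news.data, news.target, 5)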