Sveučilište u Zagrebu
Fakultet elektrotehnike i računarstva
http://www.fer.unizg.hr/predmet/su
Ak. god. 2015./2016.
(c) 2015 Jan Šnajder
Verzija: 0.1 (2015-12-19)
In [2]:
# Učitaj osnovne biblioteke...
import scipy as sp
import sklearn
import pandas as pd
%pylab inline
In [42]:
y_test = sp.random.choice((0,1), size=10); y_test
Out[42]:
In [50]:
y_pred = sp.random.choice((0,1), size=10); y_pred
Out[50]:
In [105]:
def cm(y_true, y_pred):
    """Compute a 2x2 confusion matrix for binary 0/1 labels.

    Layout (note: NOT sklearn's ordering, which is [[TN, FP], [FN, TP]]):

        [[TP, FP],
         [FN, TN]]

    Parameters
    ----------
    y_true : iterable of {0, 1} -- gold labels
    y_pred : iterable of {0, 1} -- predicted labels (paired positionally)

    Returns
    -------
    numpy.ndarray of shape (2, 2) with integer counts.
    """
    # scipy's numpy re-exports (sp.array) are deprecated and removed in
    # modern SciPy, so use numpy directly.
    import numpy as np
    tp = fp = fn = tn = 0
    for t, p in zip(y_true, y_pred):
        if t == 0 and p == 1:
            fp += 1
        elif t == 1 and p == 0:
            fn += 1
        elif t == 1 and p == 1:
            tp += 1
        else:
            # t == 0 and p == 0 (any non-binary label also falls through here,
            # preserving the original function's behavior)
            tn += 1
    return np.array([[tp, fp], [fn, tn]])
In [106]:
cm(y_test, y_pred)
Out[106]:
In [102]:
from sklearn.metrics import confusion_matrix
In [103]:
confusion_matrix(y_test, y_pred)
Out[103]:
In [104]:
confusion_matrix(y_test, y_pred, labels=[1,0])
Out[104]:
In [107]:
cm(y_test, y_pred)
Out[107]:
In [121]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
In [122]:
accuracy_score(y_test, y_pred)
Out[122]:
In [94]:
precision_score(y_test, y_pred)
Out[94]:
In [112]:
recall_score(y_test, y_pred)
Out[112]:
In [604]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
titanic_df = pd.read_csv("../data/titanic-train.csv")
titanic_df.drop(['PassengerId'], axis=1, inplace=True)
titanic_df1 = titanic_df[['Pclass', 'Sex', 'Age','Survived']]
titanic_X = titanic_df[['Pclass', 'Sex', 'Age']].as_matrix()
titanic_y = titanic_df['Survived'].as_matrix()
le = LabelEncoder()
titanic_X[:,1] = le.fit_transform(titanic_X[:,1])
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
titanic_X = imp.fit_transform(titanic_X)
In [605]:
titanic_X
Out[605]:
In [497]:
titanic_y
Out[497]:
In [606]:
shape(titanic_X), shape(titanic_y)
Out[606]:
In [607]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(titanic_X, titanic_y, train_size=2.0/3, random_state=42)
In [608]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1)
lr.fit(X_train, y_train)
Out[608]:
In [609]:
lr.predict(X_train)
Out[609]:
In [610]:
y_pred_lr = lr.predict(X_test); y_pred_lr
Out[610]:
In [611]:
y_test
Out[611]:
In [612]:
cm(y_test, y_pred_lr)
Out[612]:
In [613]:
accuracy_score(y_test, y_pred_lr)
Out[613]:
In [614]:
lr.score(X_test, y_test)
Out[614]:
In [615]:
lr.score(X_train, y_train)
Out[615]:
In [616]:
precision_score(y_test, y_pred_lr, pos_label=1)
Out[616]:
In [617]:
recall_score(y_test, y_pred_lr, pos_label=1)
Out[617]:
In [618]:
from sklearn.svm import SVC
svm = SVC(C=1)
svm.fit(X_train, y_train)
Out[618]:
In [619]:
svm.score(X_test, y_test)
Out[619]:
In [620]:
y_pred_svm = svm.predict(X_test); y_pred_svm
Out[620]:
In [621]:
cm(y_test, y_pred_svm)
Out[621]:
In [622]:
precision_score(y_test, y_pred_svm, pos_label=1)
Out[622]:
In [623]:
recall_score(y_test, y_pred_svm, pos_label=1)
Out[623]:
In [624]:
y_scores_lr = lr.predict_proba(X_test)[:,1]; y_scores_lr
Out[624]:
In [625]:
print precision_score(y_test, y_pred_lr)
print recall_score(y_test, y_pred_lr)
In [626]:
threshold = 0.4
y_pred_lr_tweaked = map(lambda s : 1 if s > threshold else 0, y_scores_lr)
print y_pred_lr_tweaked
In [627]:
print precision_score(y_test, y_pred_lr_tweaked)
print recall_score(y_test, y_pred_lr_tweaked)
In [628]:
from sklearn.metrics import precision_recall_curve
In [629]:
pr, re, _ = precision_recall_curve(y_test, y_scores_lr, pos_label=1)
In [630]:
pr
Out[630]:
In [631]:
re
Out[631]:
In [632]:
plt.plot(re, pr)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
In [633]:
from sklearn.metrics import average_precision_score
In [634]:
average_precision_score(y_test, y_scores_lr)
Out[634]:
In [635]:
y_scores_svm = svm.decision_function(X_test)[:,0]
print y_scores_svm
In [636]:
pr_lr, re_lr, _ = precision_recall_curve(y_test, y_scores_lr, pos_label=1)
pr_svm, re_svm, _ = precision_recall_curve(y_test, y_scores_svm, pos_label=1)
plt.plot(re_lr, pr_lr, label='LR')
plt.plot(re_svm, pr_svm, label='SVM')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()
In [637]:
print average_precision_score(y_test, y_scores_lr)
print average_precision_score(y_test, y_scores_svm)
In [540]:
from sklearn.metrics import roc_curve, auc
In [552]:
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_scores_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_scores_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)
In [599]:
plt.plot(fpr_lr, tpr_lr, label='LR ROC curve (area = %0.2f)' % roc_auc_lr)
plt.plot(fpr_svm, tpr_svm, label='SVM ROC curve (area = %0.2f)' % roc_auc_svm)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.show()
In [555]:
def f_beta(p, r, beta):
    """Return the F-beta score: the weighted harmonic mean of precision and recall.

    beta > 1 weights recall more heavily; beta < 1 weights precision more
    heavily; beta == 1 gives the standard (balanced) F1 score.

    Parameters
    ----------
    p : float -- precision, in [0, 1]
    r : float -- recall, in [0, 1]
    beta : float -- relative weight of recall vs. precision (beta > 0)

    Returns
    -------
    float -- the F-beta score; 0.0 in the degenerate case p == r == 0,
    which would otherwise divide by zero.
    """
    denom = beta**2 * p + r
    if denom == 0:
        # Both precision and recall are zero: no true positives at all.
        return 0.0
    return ((1 + beta**2) * p * r) / denom
In [557]:
f_beta(0.5, 0.9, 1)
Out[557]:
In [560]:
f_beta(0.5, 0.9, 0.5)
Out[560]:
In [561]:
f_beta(0.5, 0.9, 2)
Out[561]:
In [562]:
(0.5 + 0.9) / 2
Out[562]:
In [563]:
sqrt(0.5 * 0.9)
Out[563]:
In [565]:
2/(1/0.5 + 1/0.9)
Out[565]:
In [578]:
r = 0.5
xs = sp.linspace(0, 1)
plt.plot(xs, (xs + r)/2, label='aritm')
plt.plot(xs, sp.sqrt(xs*r), label='geom')
plt.plot(xs, 2/(1/xs + 1/r), label='harm')
plt.legend(loc='lower right')
plt.show()
In [669]:
data = sp.loadtxt("path/do/glass.data", delimiter=",", skiprows=1)
In [640]:
print data
In [670]:
shape(data)
Out[670]:
In [671]:
glass_X, glass_y = data[:,1:10], data[:,10]
In [672]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(glass_X, glass_y, train_size=2.0/3, random_state=42)
In [673]:
X_train.shape, X_test.shape
Out[673]:
In [675]:
from sklearn.svm import SVC
In [676]:
m = SVC() # SVC(C=1, gamma='auto')
In [677]:
m.fit(X_train, y_train)
Out[677]:
In [678]:
m.classes_
Out[678]:
In [648]:
y_pred = m.predict(X_test); y_pred
Out[648]:
In [649]:
from sklearn.metrics import confusion_matrix
In [650]:
confusion_matrix(y_test, y_pred)
Out[650]:
In [651]:
from sklearn.metrics import f1_score
In [667]:
f1_score(y_test, y_pred, average=None)
Out[667]:
In [668]:
sp.mean(_)
Out[668]:
In [597]:
f1_score(y_test, y_pred, average='macro')
Out[597]:
In [598]:
f1_score(y_test, y_pred, average='micro')
Out[598]:
TODO
TODO
TODO
TODO