In [ ]:
# Sample Generation
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
# X为样本特征,y为样本簇类别,共1000个样本,每个样本2个特征,共2个簇
x_all, y_all = make_blobs(n_samples=1000, n_features=2,
centers=[[5,5], [6,6]],
cluster_std=[0.3, 0.4],
random_state=9)
plt.figure()
colors = ['r', 'b']
for y, c in zip(np.unique(y_all), colors):
plt.scatter(X_all[y_all==y, 0], x_all[y_all==y, 1], c=c, label=y, marker='o')
In [ ]:
# Explain decision_function, predict_proba and predict
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from sklearn.datasets directly.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np


def sigmoid_array(x):
    """Element-wise logistic sigmoid: 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


# Two overlapping Gaussian blobs so the classifier is not trivially perfect.
x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[1, 1],
                          random_state=9)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)
clf = LogisticRegression(C=1, intercept_scaling=1, random_state=1)
clf.fit(x_train, y_train)
# Inspect the fitted model: class labels, weights, bias, iteration count.
print(clf.classes_)
print(clf.coef_)
print(clf.intercept_)
print(clf.n_iter_)
print(clf.densify())
print(clf.get_params(deep=True))
# predict -> hard labels; predict_proba -> class probabilities;
# decision_function -> raw signed distance (confidence score).
y_pred = clf.predict(x_test)
y_pred_proba = clf.predict_proba(x_test)
y_confidence = clf.decision_function(x_test)
# Applying the sigmoid to decision_function reproduces predict_proba's
# positive-class column, demonstrating how the three APIs relate.
y_sigmoid = sigmoid_array(y_confidence)
result = pd.DataFrame({'y': y_test,
                       'y_pred': y_pred,
                       'y_pred_proba': y_pred_proba[:, 1],
                       'y_confidence': y_confidence,
                       'y_sigmoid': y_sigmoid})
print(result)
In [ ]:
# Explain how to calculate decision_function
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from sklearn.datasets directly.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np


def sigmoid_array(x):
    """Element-wise logistic sigmoid: 1 / (1 + exp(-x))."""
    return 1 / (1 + np.exp(-x))


x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[1, 1],
                          random_state=9)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)
clf = LogisticRegression(C=1, intercept_scaling=1, random_state=1)
clf.fit(x_train, y_train)
print(clf.classes_)
print(clf.coef_)
print(clf.intercept_)


def func(x):
    """Manually compute w . x + b for one sample.

    Uses intercept_[0] so the result is a scalar; the original returned a
    1-element array (intercept_ is an array), which forced slicing later.
    """
    return x[0] * clf.coef_[0][0] + x[1] * clf.coef_[0][1] + clf.intercept_[0]


# The hand-computed linear score matches decision_function exactly.
y_calc = np.apply_along_axis(func, axis=1, arr=x_test)
y_confidence = clf.decision_function(x_test)
result = pd.DataFrame({'y_calc': y_calc, 'y_confidence': y_confidence})
print(result)
print('Finished')
In [ ]:
# Example for intercept_scaling in LogisticRegression
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from sklearn.datasets directly.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np

# Larger cluster_std (2 vs 1 above) makes the classes overlap more, so the
# effect of hyper-parameters on the report is easier to observe.
x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[2, 2],
                          random_state=9)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)
# Sweep intercept_scaling over [0.1, 1.0] and compare test-set reports.
# NOTE(review): intercept_scaling only matters when solver='liblinear' and
# fit_intercept=True — confirm the solver in use before drawing conclusions.
sc_list = np.linspace(0.1, 1, 10)
for sc in sc_list:
    print('intercept_scaling = {}'.format(sc))
    clf = LogisticRegression(C=1, intercept_scaling=sc, random_state=1)
    clf.fit(x_train, y_train)
    # print(classification_report(y_train, clf.predict(x_train)))
    print(classification_report(y_test, clf.predict(x_test)))
In [ ]:
# Example for Prediction Threshold
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from sklearn.datasets directly.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np

x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[2, 2],
                          random_state=9)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=1)
clf = LogisticRegression(C=1, intercept_scaling=1, random_state=1)
clf.fit(x_train, y_train)
# Baseline: predict() uses the default 0.5 probability threshold.
print(classification_report(y_train, clf.predict(x_train)))
print(classification_report(y_test, clf.predict(x_test)))
# Custom threshold: take the positive-class probability column and apply
# a stricter 0.6 cut-off. Cast the boolean mask to int so the predicted
# labels have the same type (0/1) as y_train / y_test.
pred_proba_train = clf.predict_proba(x_train)[:, 1]
pred_proba_test = clf.predict_proba(x_test)[:, 1]
print(classification_report(y_train, (pred_proba_train > 0.6).astype(int)))
print(classification_report(y_test, (pred_proba_test > 0.6).astype(int)))
In [ ]:
# Grid Search
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# import make_blobs from sklearn.datasets directly.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Bug fix: this cell constructs LogisticRegression but never imported it
# (it only imported the unused `svm` module, relying on earlier cells).
from sklearn.linear_model import LogisticRegression
from sklearn import svm

x_all, y_all = make_blobs(n_samples=1000, n_features=2,
                          centers=[[5, 5], [6, 6]],
                          cluster_std=[2, 2],
                          random_state=9)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all,
                                                    test_size=0.2, random_state=0)
tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                     'intercept_scaling': [0.1, 0.5, 1, 5, 10]}]
# Tuning hyper-parameters via 5-fold cross-validation on the training set.
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5)
clf.fit(x_train, y_train)
# Fix: the original printed "Best score found on test set" for both the
# score and the estimator; best_score_ is a cross-validation score on the
# training data, not a test-set score.
print("Best cross-validation score:")
print(clf.best_score_)
print()
print("Best estimator:")
print(clf.best_estimator_)
print()
print("Best parameters set found by cross-validation:")
print(clf.best_params_)
print(clf.best_estimator_.get_params())
print()
print("Grid scores from cross-validation:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
# Fix: the original loop variable rebound `params`, shadowing the list
# being iterated; use a distinct name.
for mean, std, param in zip(means, stds, params):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, param))
print()
print("Detailed classification report:")
# Held-out test set: clf.predict uses the refit best_estimator_.
y_true, y_pred = y_test, clf.predict(x_test)
print('Score of best_estimator_: {}'.format(clf.score(x_test, y_test)))
print(classification_report(y_true, y_pred))