In [1]:
# Show a period image of the Titanic from the NYT "On This Day" archive.
from IPython.display import Image

titanic_image_url = 'http://graphics8.nytimes.com/images/section/learning/general/onthisday/big/0415_big.gif'
Image(url=titanic_image_url)
Out[1]:
In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.grid_search module was removed in 0.20.
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image

# Silence pandas' chained-assignment (SettingWithCopy) warning — several
# cells below assign into column selections.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html
pd.set_option('chained_assignment', None)

# Use the ggplot style and hide the major tick marks on both axes.
plt.style.use('ggplot')
plt.rc('xtick.major', size=0)
plt.rc('ytick.major', size=0)
In [18]:
# Load the Kaggle Titanic training and test sets from the working directory.
df_train, df_test = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))
In [8]:
# Peek at the last few rows of the training set (rich display).
df_train.tail()
Out[8]:
In [20]:
# Peek at the last few rows of the test set (no Survived column here).
df_test.tail()
Out[20]:
In [10]:
# Pull out the sex column (feature) and the survival flag (target).
x, y = df_train['Sex'], df_train['Survived']
In [12]:
# Gender-only baseline: predict that every female survives and every male dies.
sex_to_survived = {'female': 1, 'male': 0}
y_pred = x.map(sex_to_survived).astype(int)
In [13]:
# Report the fraction of passengers the gender rule classifies correctly.
acc = accuracy_score(y, y_pred)
print('Accuracy: {:.3f}'.format(acc))
In [14]:
# Per-class precision, recall and F1 score for the gender-only baseline.
report = classification_report(y, y_pred)
print(report)
In [16]:
# Show the confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y, y_pred)
print(cm)
In [21]:
# Visualize a 2x2 confusion matrix as a heat map. With sklearn's layout and
# labels [0, 1] (positive class = survived) the cells are:
#   [[TN, FP],
#    [FN, TP]]
def plot_confusion_matrix(cm):
    """Draw the given 2x2 confusion matrix as a labelled blue heat map."""
    fig, ax = plt.subplots()
    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title('Confusion Matrix')
    fig.colorbar(image)
    class_names = ['not survived', 'survived']
    ticks = np.arange(len(class_names))
    ax.set_xticks(ticks)
    ax.set_xticklabels(class_names, rotation=45)
    ax.set_yticks(ticks)
    ax.set_yticklabels(class_names)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()

plot_confusion_matrix(cm)
In [23]:
# Apply the same gender-only rule to the Kaggle test set.
x_test = df_test['Sex']
y_test_pred = x_test.map({'female': 1, 'male': 0}).astype(int)
In [26]:
# Build the Kaggle submission file: one row per passenger with its prediction.
df_kaggle = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': np.array(y_test_pred),
})
df_kaggle.to_csv('kaggle_gendermodel.csv', index=False)
In [25]:
# Sanity-check the first few submission rows.
df_kaggle.head()
Out[25]:
In [28]:
# This time use age, passenger class and sex as features.
feature_cols = ['Age', 'Pclass', 'Sex']
X = df_train[feature_cols]
y = df_train['Survived']
X.tail()
Out[28]:
In [29]:
# Impute missing ages with the training-set mean, into a new AgeFill column,
# then drop the raw Age column.
# NOTE(review): X is a column selection of df_train, so the original
# `X['AgeFill'] = ...` assignment triggered pandas' chained-assignment
# warning (silenced in the setup cell). Deriving a new frame with .assign
# avoids writing into a view of df_train; the resulting columns
# (Pclass, Sex, AgeFill) are identical.
X = X.assign(AgeFill=X['Age'].fillna(X['Age'].mean())).drop(['Age'], axis=1)
# Equivalent alternative that keeps the original column name:
# X['Age'] = X['Age'].fillna(X['Age'].mean())
In [31]:
# Encode sex numerically; it is binary, so 0/1 is enough.
gender_codes = {'female': 0, 'male': 1}
X['Gender'] = X['Sex'].map(gender_codes).astype(int)
X.tail()
Out[31]:
In [32]:
# New combined feature Pclass_Gender, encoding the hypothesis that females
# (Gender=0) in higher cabin classes (Pclass=1) survive more often.
# Smaller Pclass_Gender therefore means a higher expected survival rate.
X['Pclass_Gender'] = X['Pclass'].add(X['Gender'])
X.tail()
Out[32]:
In [33]:
# Only Pclass_Gender and AgeFill are used as features, so drop the rest.
redundant_cols = ['Pclass', 'Sex', 'Gender']
X = X.drop(redundant_cols, axis=1)
X.tail()
Out[33]:
In [41]:
# Scatter plot: age on the x-axis, Pclass_Gender on the y-axis, with a little
# vertical jitter so overlapping integer-valued points separate visually.
# Fix: the original `np.random.seed = 0` REBOUND the seed function to the
# int 0 (never seeding anything and destroying np.random.seed); it must be
# called. The original also swapped the index names: y==0 is "not survived".
# The unused cm/cm_bright locals were dropped (cm also shadowed the
# confusion matrix computed in an earlier cell).
np.random.seed(0)  # make the jitter reproducible
xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5
index_notsurvived = y[y == 0].index
index_survived = y[y == 1].index
fig, ax = plt.subplots()
sc = ax.scatter(X.loc[index_notsurvived, 'AgeFill'],
                X.loc[index_notsurvived, 'Pclass_Gender']
                + (np.random.rand(len(index_notsurvived)) - 0.5) * 0.1,
                color='r', label='Not Survived', alpha=0.3)
sc = ax.scatter(X.loc[index_survived, 'AgeFill'],
                X.loc[index_survived, 'Pclass_Gender']
                + (np.random.rand(len(index_survived)) - 0.5) * 0.1,
                color='b', label='Survived', alpha=0.3)
ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.legend(bbox_to_anchor=(1.4, 1.03))
plt.show()
In [46]:
# Hold out 20% of the labelled data for validation (fixed seed for reproducibility).
split = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_val, y_train, y_val = split
In [47]:
# Sanity-check the split sizes.
for label, subset in (('Training', X_train), ('Validation', X_val)):
    print('Num of {} Samples: {}'.format(label, len(subset)))
In [48]:
# Fit a logistic regression on the two engineered features
# (AgeFill, Pclass_Gender); the fitted estimator repr is the cell output.
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[48]:
In [49]:
# Predictions on both splits, for the evaluation cells below.
y_train_pred, y_val_pred = clf.predict(X_train), clf.predict(X_val)
In [52]:
# Compare training vs. validation accuracy (a large gap would mean overfitting).
for name, y_true, pred in (('Training', y_train, y_train_pred),
                           ('Validation', y_val, y_val_pred)):
    print('Accuracy on {} Set: {:.3f}'.format(name, accuracy_score(y_true, pred)))
In [53]:
# Confusion matrix of the logistic regression on the validation split.
cm = confusion_matrix(y_val, y_val_pred)
print(cm)
In [54]:
# Heat-map view of the validation confusion matrix.
plot_confusion_matrix(cm)
In [56]:
# Precision / recall / F1 per class on the validation split.
print(classification_report(y_val, y_val_pred))
In [58]:
# Plot the model's predicted survival probability over the whole
# (AgeFill, Pclass_Gender) plane, overlay the training points (opaque) and
# validation points (translucent), and draw the linear model's analytic
# decision boundary as a dashed line.
h = 0.02  # mesh resolution
xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))

# Survival probability (class 1) at every grid point, reshaped to the mesh.
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots()
levels = np.linspace(0, 1.0, 5)
surface_cmap = plt.cm.RdBu
point_cmap = ListedColormap(['#FF0000', '#0000FF'])
contour = ax.contourf(xx, yy, Z, cmap=surface_cmap, levels=levels, alpha=0.8)

# Jitter the integer-valued Pclass_Gender so overlapping markers separate.
train_jitter = (np.random.rand(len(X_train)) - 0.5) * 0.1
val_jitter = (np.random.rand(len(X_val)) - 0.5) * 0.1
ax.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1] + train_jitter, c=y_train, cmap=point_cmap)
ax.scatter(X_val.iloc[:, 0], X_val.iloc[:, 1] + val_jitter, c=y_val, cmap=point_cmap, alpha=0.5)

ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
fig.colorbar(contour)

# Boundary of a linear model: w0 + w1*x + w2*y = 0  ->  y = -(w0 + w1*x)/w2.
x1, x2 = xmin, xmax
y1 = -1 * (clf.intercept_[0] + clf.coef_[0][0] * xmin) / clf.coef_[0][1]
y2 = -1 * (clf.intercept_[0] + clf.coef_[0][0] * xmax) / clf.coef_[0][1]
ax.plot([x1, x2], [y1, y2], 'k--')
plt.show()
In [59]:
# Try a linear-kernel support vector machine on the same two features.
clf_svc_lin = SVC(kernel='linear', probability=True)
clf_svc_lin.fit(X_train, y_train)
y_train_svc_pred = clf_svc_lin.predict(X_train)
y_val_svc_pred = clf_svc_lin.predict(X_val)
for name, y_true, pred in (('Training', y_train, y_train_svc_pred),
                           ('Validation', y_val, y_val_svc_pred)):
    print('Accuracy on {} Set: {:.3f}'.format(name, accuracy_score(y_true, pred)))
In [60]:
# Confusion matrix for the SVM on the validation split, printed and plotted.
cm_svc = confusion_matrix(y_val, y_val_svc_pred)
print(cm_svc)
plot_confusion_matrix(cm_svc)
In [61]:
# Precision / recall / F1 per class for the SVM on the validation split.
print(classification_report(y_val, y_val_svc_pred))
In [ ]: