In [1]:
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [2]:
# Download the titanic3.csv passenger table and parse it into a DataFrame.
data = pd.read_csv(
    'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
)
# Convert pclass to strings; later cells compare it against '1'/'2'/'3'.
data = data.astype({'pclass': str})
In [3]:
# Display the column labels of the loaded table.
data.columns
Out[3]:
In [4]:
# Summary statistics for the table; with default arguments pandas reports
# only the numeric columns of a mixed-dtype frame.
data.describe()
Out[4]:
In [5]:
# Summary statistics for age, restricted to rows where age is recorded.
data.dropna(subset=['age'])[['age']].describe()
Out[5]:
In [6]:
# Summary statistics for fare, restricted to rows where fare is recorded.
data.dropna(subset=['fare'])[['fare']].describe()
Out[6]:
In [7]:
# Summary statistics for the body column, restricted to rows where it is recorded.
data.dropna(subset=['body'])[['body']].describe()
Out[7]:
In [8]:
# Histogram of the recorded ages, split into 16 bins.
data.dropna(subset=['age'])[['age']].plot.hist(bins=16)
Out[8]:
In [9]:
# Histogram of the recorded fares, split into 20 bins.
data.dropna(subset=['fare'])[['fare']].plot.hist(bins=20)
Out[9]:
In [10]:
# Keep only the rows where both age and fare are recorded, then plot fare against age.
df = data.dropna(subset=['age', 'fare'])[['age', 'fare']]
df.plot.scatter(x='age', y='fare')
Out[10]:
In [11]:
# Box plot of fare, one box per passenger class; rows missing either value are dropped first.
df = data.dropna(subset=['fare', 'pclass'])[['fare', 'pclass']]
df.boxplot(by='pclass', column='fare')
Out[11]:
In [12]:
# Fare against age, coloured by passenger class.
# pclass holds the strings '1', '2', '3', hence the string comparisons.
# Rows missing age or fare are dropped because they cannot be drawn.
df1 = data[data.pclass=='1'][['age','fare']].dropna()
df2 = data[data.pclass=='2'][['age','fare']].dropna()
df3 = data[data.pclass=='3'][['age','fare']].dropna()
# Each series gets a label so the legend can explain what the colours mean.
plt.scatter(df1.age, df1.fare, facecolor='blue', label='pclass 1')
plt.scatter(df2.age, df2.fare, facecolor='green', label='pclass 2')
plt.scatter(df3.age, df3.fare, facecolor='red', label='pclass 3')
# Raw plt.scatter adds no axis names or legend on its own, so set them here.
plt.xlabel('age')
plt.ylabel('fare')
plt.legend()
Out[12]:
In [13]:
# Contingency table: number of passengers for each (sex, survived) combination.
df = data.dropna(subset=['sex', 'survived'])[['sex', 'survived']]
pd.crosstab(index=df['sex'], columns=df['survived'])
Out[13]:
In [14]:
# Bar chart of the sex-by-survived counts, computed directly from the full table.
pd.crosstab(index=data['sex'], columns=data['survived']).plot.bar()
Out[14]:
In [15]:
# Column means of df, which (when cells run in order) still holds the
# ['sex', 'survived'] selection from the crosstab cell above.
# numeric_only=True skips the string column `sex`; without it pandas >= 2.0
# raises TypeError instead of silently dropping non-numeric columns.
df.mean(numeric_only=True)
Out[15]:
In [16]:
# Survival rate among female passengers, computed from the data itself.
# This cell used to be the hand-typed 339.0/(127+339).
# NOTE(review): those numbers look like the female survived / not-survived
# counts from the sex-by-survived crosstab, and 'female' is assumed to be
# the label used in the sex column - confirm both against that table.
data.loc[data['sex'] == 'female', 'survived'].mean()
Out[16]:
In [17]:
# Contingency table: number of passengers for each (pclass, survived) combination.
df = data.dropna(subset=['pclass', 'survived'])[['pclass', 'survived']]
pd.crosstab(index=df['pclass'], columns=df['survived'])
Out[17]:
In [18]:
# Bar chart of the pclass-by-survived counts, computed directly from the full table.
pd.crosstab(index=data['pclass'], columns=data['survived']).plot.bar()
Out[18]:
In [19]:
# Overlaid age histograms, one per value of `survived`.
# reset_index() turns the row labels into an 'index' column. pivot() then
# spreads `age` into one column per `survived` value, so each row has a
# value in exactly one of those columns and NaN in the others.
# pivot() is called with keyword arguments because positional arguments
# were removed in pandas 2.0.
# The np.histogram call that used to sit here was dropped: its bin edges
# were never used.
data.reset_index().pivot(index='index', columns='survived', values='age').plot(kind='hist', bins=16, alpha=0.5)
Out[19]:
In [20]:
from PIL import Image
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
In [21]:
# Modelling table: age and sex as predictors, survived as the target.
# Rows missing any of the three columns are removed first.
tmp = data.dropna(subset=['age', 'sex', 'survived'])[['age', 'sex', 'survived']]
X_ = tmp.loc[:, ['age', 'sex']]
y = tmp.loc[:, 'survived']
X_.head(n=5)
Out[21]:
In [22]:
# Replace the non-numeric predictor columns with indicator (dummy) columns.
X = pd.get_dummies(data=X_)
X.head(n=5)
Out[22]:
In [23]:
# Fit logistic regression on a hold-out split and report accuracy on both parts.
# train_size=0.8 puts 80% of the rows in the training part; random_state
# fixes the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
# print(...) with one parenthesised argument runs on both Python 2 and
# Python 3; the former print statements were a SyntaxError on Python 3.
print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
In [24]:
# 5-fold cross-validation of a fresh logistic-regression model on all of X, y.
clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5)
# Single-argument print(...) calls run on both Python 2 and Python 3.
# Formatting `scores` into the string prints the same text that
# `print 'Scores:', scores` did.
print('Scores: {}'.format(scores))
print('Mean Score: {:f} ± {:.3}'.format(scores.mean(), scores.std()))
In [25]:
# Fit a depth-2 decision tree on a hold-out split and report accuracy on both parts.
# train_size=0.8 puts 80% of the rows in the training part; random_state
# fixes the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)
# random_state on the tree makes the choice between equally good splits
# repeatable from run to run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=2,
                             random_state=1)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
# print(...) with one parenthesised argument runs on both Python 2 and
# Python 3; the former print statements were a SyntaxError on Python 3.
print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))
In [26]:
# 5-fold cross-validation of a depth-2 decision tree on all of X, y.
# random_state makes the choice between equally good splits repeatable,
# so the scores (and any tree later fitted from this clf) do not vary
# from run to run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=2,
                             random_state=1)
scores = cross_val_score(clf, X, y, cv=5)
# Single-argument print(...) calls run on both Python 2 and Python 3.
# Formatting `scores` into the string prints the same text that
# `print 'Scores:', scores` did.
print('Scores: {}'.format(scores))
print('Mean Score: {:f} ± {:.3}'.format(scores.mean(), scores.std()))
In [27]:
# Fit the current classifier on all of X, y and render the tree as a PNG.
clf.fit(X, y)
# feature_names is passed as a plain list of column names rather than a
# pandas Index.
export_graphviz(clf, out_file='tree.dot',
                feature_names=list(X.columns),
                class_names=['not survived', 'survived'],
                impurity=False, filled=True)
# check_call raises if Graphviz `dot` is missing or exits with an error.
# The former `!dot ...` shell escape ignored failures, so a tree.png left
# over from an earlier run could be displayed as if it were this tree.
subprocess.check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'])
Image.open("tree.png")
Out[27]:
In [28]:
# Modelling table with pclass added: age, sex and pclass as predictors,
# survived as the target. Rows missing any of the four columns are removed first.
tmp = data.dropna(subset=['age', 'sex', 'pclass', 'survived'])[['age', 'sex', 'pclass', 'survived']]
X_ = tmp.loc[:, ['age', 'sex', 'pclass']]
y = tmp.loc[:, 'survived']
X_.head(n=5)
Out[28]:
In [29]:
# Replace the non-numeric predictor columns with indicator (dummy) columns.
X = pd.get_dummies(data=X_)
X.head(n=5)
Out[29]:
In [30]:
# 5-fold cross-validation of a depth-3 decision tree on all of X, y.
# random_state makes the choice between equally good splits repeatable,
# so the scores (and any tree later fitted from this clf) do not vary
# from run to run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2,
                             random_state=1)
scores = cross_val_score(clf, X, y, cv=5)
# Single-argument print(...) calls run on both Python 2 and Python 3.
# Formatting `scores` into the string prints the same text that
# `print 'Scores:', scores` did.
print('Scores: {}'.format(scores))
print('Mean Score: {:f} ± {:.3}'.format(scores.mean(), scores.std()))
In [31]:
# Fit the current classifier on all of X, y and render the tree as a PNG.
clf.fit(X, y)
# feature_names is passed as a plain list of column names rather than a
# pandas Index.
export_graphviz(clf, out_file='tree.dot',
                feature_names=list(X.columns),
                class_names=['not survived', 'survived'],
                impurity=False, filled=True)
# check_call raises if Graphviz `dot` is missing or exits with an error.
# The former `!dot ...` shell escape ignored failures, so a tree.png left
# over from an earlier run could be displayed as if it were this tree.
subprocess.check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'])
Image.open("tree.png")
Out[31]: