필요한 모듈 임포팅


In [1]:
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was deprecated then and removed in
# 0.20. Prefer the current location and fall back only on older installs so the
# notebook runs on both.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

pandas를 이용하여 타이타닉 데이터를 로드하고 데이터 구조를 살펴봅니다. 타이타닉 학습 데이터는 data 폴더 아래(data/titanic_train.csv)에 있습니다.


In [2]:
# Read the Titanic training set (path is relative to the notebook's working
# directory) into a DataFrame.
train = pandas.read_csv(filepath_or_buffer='data/titanic_train.csv')

In [3]:
# (rows, columns) of the loaded frame.
train.shape


Out[3]:
(891, 12)

In [4]:
# Column labels available in the training data.
train.columns


Out[4]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Preview the first few rows to see what each column holds.
train.head()


Out[5]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

scikit-learn을 이용한 로지스틱 회귀분석


In [6]:
# Target: the Survived column.
y = train['Survived']
# Features: three numeric columns; any missing entries are replaced with 0.
X = train[['Age', 'SibSp', 'Fare']].fillna(0)

In [7]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Build the scikit-learn classifier with every hyperparameter left at its default.
lr = LogisticRegression()

In [9]:
# Fit on the training split. The cell's displayed value is whatever fit() returns.
lr.fit(X_train, y_train)


Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Accuracy of the scikit-learn model on the held-out split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Accuracy is symmetric in its two arguments, so
# the printed value is unchanged; the order matters once this line is adapted
# to a non-symmetric metric such as precision or recall.
print(accuracy_score(y_test, lr.predict(X_test)))


0.664804469274

TensorFlow(skflow)를 이용한 로지스틱 회귀분석


In [11]:
from tensorflow.contrib import skflow
import random

In [12]:
# Seed Python's built-in `random` module.
# NOTE(review): none of the cells shown here call `random` directly, and it is
# not visible whether skflow draws from this generator rather than NumPy's or
# TensorFlow's own RNGs - confirm this really makes the training run below
# repeatable.
random.seed(42)

In [13]:
# skflow linear classifier for the two Survived classes, configured with the
# batch size, step count and learning rate below.
classifier = skflow.TensorFlowLinearClassifier(
    n_classes=2,
    batch_size=128,
    steps=500,
    learning_rate=0.05,
)

In [14]:
# Train on the same training split that was used for the scikit-learn model.
classifier.fit(X_train, y_train)


Step #100, epoch #16, avg. train loss: 0.68626
Step #200, epoch #33, avg. train loss: 0.63602
Step #300, epoch #50, avg. train loss: 0.63053
Step #400, epoch #66, avg. train loss: 0.62640
Step #500, epoch #83, avg. train loss: 0.62822
Out[14]:
TensorFlowLinearClassifier(batch_size=128, class_weight=None,
              clip_gradients=5.0, config=None, continue_training=False,
              learning_rate=0.05, n_classes=2, optimizer='Adagrad',
              steps=500, verbose=1)

In [15]:
# Accuracy of the skflow model on the same held-out split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Accuracy is symmetric in its two arguments, so
# the printed value is unchanged.
print(accuracy_score(y_test, classifier.predict(X_test)))


0.670391061453

In [ ]: