필요한 패키지와 타이타닉 데이터 로드 하기


In [1]:
import random
import pandas
import numpy as np
import tensorflow as tf
from sklearn import metrics, cross_validation
from tensorflow.contrib import skflow

In [2]:
# Seed Python's stdlib RNG for reproducibility.
# NOTE(review): this does not seed numpy or TensorFlow — confirm whether
# np.random.seed / TF graph-level seeding is also needed for stable results.
random.seed(42)

In [3]:
# Load the Titanic training set from a path relative to the notebook.
data = pandas.read_csv('data/titanic_train.csv')

In [4]:
X = data[["Embarked"]]

In [5]:
y = data[["Survived"]]

In [6]:
# Hold out 20% of rows for evaluation; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
embarked_classes = X_train["Embarked"].unique()

In [8]:
# Show which categories the processor will need to encode.
print('Embarked has next classes: ', embarked_classes)


Embarked has next classes:  ['S' 'C' 'Q' nan]

In [9]:
# Maps each category string to an integer id; the fitted vocabulary is stored
# in cat_processor.vocabularies_ (used below to derive n_classes).
cat_processor = skflow.preprocessing.CategoricalProcessor()

In [10]:
# Fit the vocabulary on the training split and encode it to integer ids;
# fit_transform yields a generator, hence list() before np.array().
X_train = np.array(list(cat_processor.fit_transform(X_train)))

In [11]:
# Reuse the vocabulary fitted on the training split.  The original called
# fit_transform here, which RE-FITS the mapping on the test data: the same
# port can get a different integer id than in X_train, silently corrupting
# evaluation (and leaking test information into the encoder).
X_test = np.array(list(cat_processor.transform(X_test)))

Categorical Processor로부터 얻은, 변수에 대한 클래스의 총 개수. 변수의 고유한 클래스와 미지정(결측) 값도 포함됨.


In [12]:
# Total number of classes for the (single) feature, taken from the fitted
# vocabulary — includes the slot reserved for unseen/missing values.
n_classes = len(cat_processor.vocabularies_[0])

Embedding


In [13]:
# Dimensionality of the learned embedding vector per category.
EMBEDDING_SIZE = 3

In [14]:
def categorical_model(X, y):
    """Learn an embedding for the integer-encoded category, then apply
    logistic regression.

    X holds one categorical column as integer ids (shape: rows x 1);
    y holds the binary survival labels.
    """
    embedded = skflow.ops.categorical_variable(
        X, n_classes, embedding_size=EMBEDDING_SIZE, name='embarked')
    # Drop the per-row singleton column dimension before the linear layer.
    flat = tf.squeeze(embedded, [1])
    return skflow.models.logistic_regression(flat, y)

In [15]:
# Binary classifier (n_classes=2: survived / not survived) built on the
# embedding model; other hyperparameters are left at skflow defaults.
classifier = skflow.TensorFlowEstimator(model_fn=categorical_model, n_classes=2)

In [16]:
# Train on the integer-encoded feature; y_train['Survived'] selects the
# label column as a Series (y_train is a one-column DataFrame).
classifier.fit(X_train, y_train['Survived'])


Step #100, epoch #4, avg. train loss: 0.65760
Step #200, epoch #8, avg. train loss: 0.65196
Out[16]:
TensorFlowEstimator(batch_size=32, class_weight=None, clip_gradients=5.0,
          config=None, continue_training=False, learning_rate=0.1,
          model_fn=<function categorical_model at 0x1127420d0>,
          n_classes=2, optimizer='Adagrad', steps=200, verbose=1)

In [17]:
print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))


Accuracy: 0.6256983240223464

In [18]:
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))


ROC: 0.6105506155950753

One Hot


In [19]:
def one_hot_categorical_model(X, y):
    """One-hot encode the category ids and fit logistic regression on them.

    Unlike categorical_model, no embedding is learned: each of the
    n_classes ids becomes its own indicator column.
    """
    one_hot = skflow.ops.one_hot_matrix(X, n_classes)
    # Remove the singleton column dimension so rows are flat indicator vectors.
    flattened = tf.squeeze(one_hot, [1])
    return skflow.models.logistic_regression(flattened, y)

In [20]:
# One-hot variant: trained longer (1000 steps) with a smaller learning rate
# (0.01) than the embedding model's defaults.  NOTE(review): this rebinds
# `classifier`, shadowing the embedding-based estimator trained above.
classifier = skflow.TensorFlowEstimator(model_fn=one_hot_categorical_model,
                                       n_classes=2, steps=1000, learning_rate=0.01)

In [21]:
# Train the one-hot model on the same encoded training data and labels.
classifier.fit(X_train, y_train['Survived'])


Step #100, epoch #4, avg. train loss: 0.84168
Step #200, epoch #8, avg. train loss: 0.75244
Step #300, epoch #13, avg. train loss: 0.72017
Step #400, epoch #17, avg. train loss: 0.69646
Step #500, epoch #21, avg. train loss: 0.68045
Step #600, epoch #26, avg. train loss: 0.67679
Step #700, epoch #30, avg. train loss: 0.67083
Step #800, epoch #34, avg. train loss: 0.66649
Step #900, epoch #39, avg. train loss: 0.66256
Step #1000, epoch #43, avg. train loss: 0.66194
Out[21]:
TensorFlowEstimator(batch_size=32, class_weight=None, clip_gradients=5.0,
          config=None, continue_training=False, learning_rate=0.01,
          model_fn=<function one_hot_categorical_model at 0x1129a6158>,
          n_classes=2, optimizer='Adagrad', steps=1000, verbose=1)

In [22]:
print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))


Accuracy: 0.6201117318435754

In [23]:
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))


ROC: 0.6027310924369748

In [ ]: