In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')
df.tail()


Out[1]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
886 0 2 male 27.0 0 0 13.00 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.00 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.45 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.00 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.75 Q Third man True NaN Queenstown no True
  • Column explanation

survived: survived (0 = no, 1 = yes), pclass: ticket class (1st/2nd/3rd), sex: sex, age: age in years, sibsp: number of siblings/spouses aboard, parch: number of parents/children aboard, fare: passenger fare, embarked: port of embarkation


In [2]:
df1 = df.ix[:,0:8]
df1.tail()  # the columns from 'who' onward look like they were added by the instructor for explanation, so I dropped everything past 'embarked'


Out[2]:
survived pclass sex age sibsp parch fare embarked
886 0 2 male 27.0 0 0 13.00 S
887 1 1 female 19.0 0 0 30.00 S
888 0 3 female NaN 1 2 23.45 S
889 1 1 male 26.0 0 0 30.00 C
890 0 3 male 32.0 0 0 7.75 Q
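
.ix was deprecated in later pandas releases; a position-based equivalent (or a label-based one) that produces the same eight columns, as a sketch:

In [ ]:
# same slice without the deprecated .ix indexer
df1 = df.iloc[:, 0:8]                    # columns 0-7: survived ... embarked
# or by label, since the columns are ordered as in the CSV:
df1 = df.loc[:, 'survived':'embarked']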

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['sex']= le.fit_transform(df1['sex'])
df1['embarked'] = le.fit_transform(df1['embarked'])


c:\python27\lib\site-packages\numpy\lib\arraysetops.py:216: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
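
The FutureWarning above is triggered because 'embarked' contains two NaN values, which LabelEncoder simply encodes as a class of their own. A sketch that fills the missing ports first (assuming 'S', the modal port, as the fill value; note the integer codes would then differ from the outputs below):

In [ ]:
# fill the missing embarkation ports before encoding,
# so NaN does not become its own label class
df1['embarked'] = df1['embarked'].fillna('S')        # 'S' is the most common port
df1['embarked'] = le.fit_transform(df1['embarked'])  # C=0, Q=1, S=2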

In [4]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy = 'median', axis = 0)
df2 = imp.fit_transform(df1)
df2


Out[4]:
array([[  0.    ,   3.    ,   1.    , ...,   0.    ,   7.25  ,   3.    ],
       [  1.    ,   1.    ,   0.    , ...,   0.    ,  71.2833,   1.    ],
       [  1.    ,   3.    ,   0.    , ...,   0.    ,   7.925 ,   3.    ],
       ..., 
       [  0.    ,   3.    ,   0.    , ...,   2.    ,  23.45  ,   3.    ],
       [  1.    ,   1.    ,   1.    , ...,   0.    ,  30.    ,   1.    ],
       [  0.    ,   3.    ,   1.    , ...,   0.    ,   7.75  ,   2.    ]])
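
Imputer was removed in scikit-learn 0.22; in newer environments the equivalent is SimpleImputer, roughly:

In [ ]:
# newer scikit-learn equivalent of the Imputer call above
import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='median')
df2 = imp.fit_transform(df1)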

In [5]:
df3 = pd.DataFrame(df2, columns = ['survived', 'pclass','sex','age','sibsp','parch','fare','embarked'])
df3.tail()


Out[5]:
survived pclass sex age sibsp parch fare embarked
886 0.0 2.0 1.0 27.0 0.0 0.0 13.00 3.0
887 1.0 1.0 0.0 19.0 0.0 0.0 30.00 3.0
888 0.0 3.0 0.0 28.0 1.0 2.0 23.45 3.0
889 1.0 1.0 1.0 26.0 0.0 0.0 30.00 1.0
890 0.0 3.0 1.0 32.0 0.0 0.0 7.75 2.0
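
Row 888's age, which was NaN in the raw data, is now 28.0: the column median, as expected from strategy='median'. A quick check:

In [ ]:
print df1['age'].median()   # 28.0, the value imputed into row 888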

In [6]:
df3.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
survived    891 non-null float64
pclass      891 non-null float64
sex         891 non-null float64
age         891 non-null float64
sibsp       891 non-null float64
parch       891 non-null float64
fare        891 non-null float64
embarked    891 non-null float64
dtypes: float64(8)
memory usage: 55.7 KB

In [6]:
y = df3.ix[:,0]
x = df3.ix[:,1:]

Train/test split


In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state= 42 )

QDA


In [8]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis(store_covariances = True).fit(x,y)

In [84]:
qda.priors_


Out[84]:
array([ 0.61616162,  0.38383838])

In [90]:
549.0/(549 + 342)


Out[90]:
0.6161616161616161

In [73]:
qda.means_


Out[73]:
array([[  2.53187614,   0.85245902,  30.02823315,   0.55373406,
          0.32969035,  22.11788689,   2.64116576],
       [  1.9502924 ,   0.31871345,  28.29143275,   0.47368421,
          0.46491228,  48.3954076 ,   2.35087719]])

In [75]:
qda.covariances_


Out[75]:
[array([[  5.41409065e-01,  -4.72956803e-02,  -3.62544540e+00,
           1.19178201e-01,   4.14788667e-02,  -1.19491607e+01,
           5.61671520e-02],
        [ -4.72956803e-02,   1.26002154e-01,   6.44684097e-01,
          -9.69845638e-02,  -1.04553069e-01,  -1.33989712e-01,
          -3.76929520e-03],
        [ -3.62544540e+00,   6.44684097e-01,   1.56249658e+02,
          -4.67077168e+00,  -8.52390877e-01,   2.81618757e+01,
          -4.10470929e-01],
        [  1.19178201e-01,  -9.69845638e-02,  -4.67077168e+00,
           1.65997235e+00,   5.06887107e-01,   1.14085927e+01,
           8.59226464e-02],
        [  4.14788667e-02,  -1.04553069e-01,  -8.52390877e-01,
           5.06887107e-01,   6.77602276e-01,   8.97361674e+00,
           2.54543762e-02],
        [ -1.19491607e+01,  -1.33989712e-01,   2.81618757e+01,
           1.14085927e+01,   8.97361674e+00,   9.85219509e+02,
          -2.89427734e+00],
        [  5.61671520e-02,  -3.76929520e-03,  -4.10470929e-01,
           8.59226464e-02,   2.54543762e-02,  -2.89427734e+00,
           5.04214697e-01]]),
 array([[  7.45322495e-01,   2.17540430e-02,  -4.45467931e+00,
          -2.03735144e-02,   1.43797911e-02,  -3.09392328e+01,
           1.08401502e-01],
        [  2.17540430e-02,   2.17771947e-01,  -2.90604603e-01,
          -2.82450996e-02,  -3.42388229e-02,  -2.42099011e+00,
           2.86052374e-02],
        [ -4.45467931e+00,  -2.90604603e-01,   1.89459406e+02,
          -1.32197098e+00,  -3.20943638e+00,   1.46217468e+02,
          -5.19477800e-01],
        [ -2.03735144e-02,  -2.82450996e-02,  -1.32197098e+00,
           5.02238000e-01,   1.54499151e-01,   5.79979932e+00,
           1.21932397e-02],
        [  1.43797911e-02,  -3.42388229e-02,  -3.20943638e+00,
           1.54499151e-01,   5.95539435e-01,   5.98832080e+00,
           5.63358543e-02],
        [ -3.09392328e+01,  -2.42099011e+00,   1.46217468e+02,
           5.79979932e+00,   5.98832080e+00,   4.43516016e+03,
          -1.45448050e+01],
        [  1.08401502e-01,   2.86052374e-02,  -5.19477800e-01,
           1.21932397e-02,   5.63358543e-02,  -1.45448050e+01,
           8.09075475e-01]])]

In [9]:
qda_score_train = qda.score(x_train,y_train)
print "Train Score :", qda_score_train
print "=" * 40
qda_score_test = qda.score(x_test, y_test)
print "Test Score :", qda_score_test


Train Score : 0.80033557047
========================================
Test Score : 0.830508474576
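
Note that qda was fit on the full (x, y) in In [8], so x_test was seen during training and the test score above is somewhat optimistic; the same pattern recurs in the LDA, NB, and Perceptron cells below. A leakage-free variant, as a sketch:

In [ ]:
# fit on the training split only, so the held-out score is honest
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis().fit(x_train, y_train)
print "Test Score :", qda.score(x_test, y_test)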

LDA


In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
LDA = LinearDiscriminantAnalysis(n_components = 2, solver = 'svd', store_covariance = True).fit(x,y)

solver options (from the scikit-learn docs):

  • svd : Singular value decomposition (default). Does not compute the covariance matrix, therefore this solver is recommended for data with a large number of features.
  • lsqr : Least squares solution, can be combined with shrinkage.
  • eigen : Eigenvalue decomposition, can be combined with shrinkage.


In [101]:
LDA.coef_


Out[101]:
array([[ -1.18717081e+00,  -3.54813783e+00,  -4.04744326e-02,
         -2.89510309e-01,  -1.13110270e-01,   1.99004492e-03,
         -2.44515430e-01]])

In [102]:
LDA.intercept_


Out[102]:
array([ 6.17904688])

In [103]:
LDA.covariance_


Out[103]:
array([[  6.18234723e-01,  -2.07630798e-02,  -3.93466860e+00,
          6.55020039e-02,   3.10145092e-02,  -1.91901442e+01,
          7.60319995e-02],
       [ -2.07630798e-02,   1.60841093e-01,   2.85286998e-01,
         -7.04591694e-02,  -7.74079917e-02,  -1.00896071e+00,
          8.62941885e-03],
       [ -3.93466860e+00,   2.85286998e-01,   1.68608833e+02,
         -3.37864757e+00,  -1.75255669e+00,   7.32804315e+01,
         -4.51268237e-01],
       [  6.55020039e-02,  -7.04591694e-02,  -3.37864757e+00,
          1.21316274e+00,   3.70884787e-01,   9.23640893e+00,
          5.75123512e-02],
       [  3.10145092e-02,  -7.74079917e-02,  -1.75255669e+00,
          3.70884787e-01,   6.44674517e-01,   7.81095327e+00,
          3.72160769e-02],
       [ -1.91901442e+01,  -1.00896071e+00,   7.32804315e+01,
          9.23640893e+00,   7.81095327e+00,   2.30335567e+03,
         -7.34662457e+00],
       [  7.60319995e-02,   8.62941885e-03,  -4.51268237e-01,
          5.75123512e-02,   3.72160769e-02,  -7.34662457e+00,
          6.19758014e-01]])

In [104]:
LDA.means_


Out[104]:
array([[  2.53187614,   0.85245902,  30.02823315,   0.55373406,
          0.32969035,  22.11788689,   2.64116576],
       [  1.9502924 ,   0.31871345,  28.29143275,   0.47368421,
          0.46491228,  48.3954076 ,   2.35087719]])

In [80]:
LDA.n_components


Out[80]:
2

In [105]:
LDA.explained_variance_ratio_


Out[105]:
array([ 1.])
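
The single ratio of 1.0 is expected: LDA yields at most min(n_classes - 1, n_features) discriminant directions, and with two classes that is one, so the requested n_components = 2 is silently capped.

In [ ]:
print len(LDA.explained_variance_ratio_)   # 1: at most n_classes - 1 components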

In [12]:
LDA_score_train = LDA.score(x_train,y_train)
print "Train Score :", LDA_score_train
print "=" * 40
LDA_score_test = LDA.score(x_test, y_test)
print "Test Score :", LDA_score_test


Train Score : 0.796979865772
========================================
Test Score : 0.796610169492

NB


In [13]:
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB().fit(x,y)

In [108]:
NB.classes_


Out[108]:
array([ 0.,  1.])

In [109]:
NB.class_count_


Out[109]:
array([ 549.,  342.])

In [111]:
NB.class_prior_


Out[111]:
array([ 0.61616162,  0.38383838])

In [113]:
NB.theta_


Out[113]:
array([[  2.53187614,   0.85245902,  30.02823315,   0.55373406,
          0.32969035,  22.11788689,   2.64116576],
       [  1.9502924 ,   0.31871345,  28.29143275,   0.47368421,
          0.46491228,  48.3954076 ,   2.35087719]])

In [14]:
NB_score_train = NB.score(x_train,y_train)
print "Train Score :", NB_score_train
print "=" * 40
NB_score_test = NB.score(x_test, y_test)
print "Test Score :", NB_score_test


Train Score : 0.788590604027
========================================
Test Score : 0.8

BernoulliNB cannot be used here => not only the target variable but the independent variables too would need to take values of 0 or 1

Multinomial NB


In [15]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB().fit(x,y)

In [137]:
MNB.classes_


Out[137]:
array([ 0.,  1.])

In [138]:
MNB.class_count_


Out[138]:
array([ 549.,  342.])

In [16]:
MNB_score_train = MNB.score(x_train,y_train)
print "Train Score :", MNB_score_train
print "=" * 40
MNB_score_test = MNB.score(x_test, y_test)
print "Test Score :", MNB_score_test


Train Score : 0.671140939597
========================================
Test Score : 0.718644067797

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf_1 = Pipeline([
    ('clf', MultinomialNB())
])

In [18]:
clf_1.fit(x_train, y_train)
print(classification_report(y_test, clf_1.predict(x_test), digits = 4))


             precision    recall  f1-score   support

        0.0     0.7136    0.8686    0.7835       175
        1.0     0.7195    0.4917    0.5842       120

avg / total     0.7160    0.7153    0.7024       295
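
MultinomialNB models features as event counts, so continuous columns like age and fare sit poorly with its assumptions, which helps explain the weaker scores and the low recall on class 1.0. Squashing the features into [0, 1] (they must stay non-negative for MultinomialNB) is one thing to try; a sketch with a hypothetical two-step pipeline:

In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

clf_2 = Pipeline([
    ('scale', MinMaxScaler()),   # keeps features non-negative, as MultinomialNB requires
    ('clf', MultinomialNB())
])
clf_2.fit(x_train, y_train)
print "Test Score :", clf_2.score(x_test, y_test)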

Decision Tree


In [17]:
from sklearn.tree import DecisionTreeClassifier
tree1 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 1, random_state = 0).fit(x,y)

In [18]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5).fit(x_train, y_train)

In [19]:
from sklearn import tree
decision = tree.DecisionTreeClassifier()
decision.fit(x_train, y_train)
decision_score_train = decision.score(x_train, y_train)
print "Train score : ",decision_score_train
decision_score_test = decision.score(x_test, y_test)
print "-" * 40
print "Test score : ",decision_score_test


Train score :  0.979865771812
----------------------------------------
Test score :  0.742372881356
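
The unconstrained tree memorizes the training set (0.98 train vs 0.74 test). The regularized model from In [18] (max_depth = 3, min_samples_leaf = 5) was fit but never scored; scoring it, as a sketch, should narrow that gap:

In [ ]:
# score the depth-limited tree fit in In [18]
print "Train score : ", model.score(x_train, y_train)
print "Test score : ", model.score(x_test, y_test)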

In [ ]:

Perceptron

  • how the scores change with n_iter

In [213]:
from sklearn.linear_model import Perceptron
perceptron = Perceptron(n_iter = 1, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.627516778523
========================================
Test score :  0.593220338983

In [214]:
perceptron = Perceptron(n_iter = 5, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.627516778523
========================================
Test score :  0.593220338983

In [215]:
perceptron = Perceptron(n_iter = 10, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.694630872483
========================================
Test score :  0.691525423729

In [216]:
perceptron = Perceptron(n_iter = 20, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.744966442953
========================================
Test score :  0.796610169492

In [217]:
perceptron = Perceptron(n_iter = 100, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.771812080537
========================================
Test score :  0.796610169492

In [220]:
perceptron = Perceptron(n_iter = 150, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test


Train score :  0.706375838926
========================================
Test score :  0.71186440678
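
The six cells above differ only in n_iter (renamed max_iter in later scikit-learn releases). A single loop condenses them; this sketch fits on the training split only:

In [ ]:
# one loop instead of six near-identical cells
from sklearn.linear_model import Perceptron
for n in [1, 5, 10, 20, 100, 150]:
    p = Perceptron(n_iter = n, eta0 = 0.3, random_state = 1).fit(x_train, y_train)
    print n, p.score(x_train, y_train), p.score(x_test, y_test)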

In [ ]:

SVM


In [21]:
from sklearn.svm import SVC
svm = SVC()  # the default kernel is 'rbf'
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test


Train Score :  0.911073825503
----------------------------------------
Test Score :  0.684745762712

In [22]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test


Train Score :  0.781879194631
----------------------------------------
Test Score :  0.796610169492

In [22]:
from sklearn.svm import SVC
svm = SVC(kernel = 'sigmoid')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test


Train Score :  0.627516778523
----------------------------------------
Test Score :  0.593220338983

In [ ]:
from sklearn.svm import SVC
svm = SVC(kernel = 'poly')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test

Model comparison


In [23]:
models = pd.DataFrame({
        'Model'          : ['QDA', 'LDA', 'GaussianNB','MultinomialNB', 'DecisionTree','SVM'],
        'Train_Score' : [qda_score_train, LDA_score_train, NB_score_train, MNB_score_train, decision_score_train, svm_score_train],
        'Test_Score'  : [qda_score_test, LDA_score_test, NB_score_test, MNB_score_test, decision_score_test, svm_score_test]
    })
models.sort_values(by='Test_Score', ascending=True)


Out[23]:
Model Test_Score Train_Score
3 MultinomialNB 0.718644 0.671141
4 DecisionTree 0.742373 0.979866
1 LDA 0.796610 0.796980
5 SVM 0.796610 0.781879
2 GaussianNB 0.800000 0.788591
0 QDA 0.830508 0.800336

Pipeline - would it be more efficient?
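
One answer: a Pipeline is less about raw speed than about bundling preprocessing with the model, so that imputation and scaling are learned from the training fold only and applied consistently at prediction time. A sketch chaining this notebook's steps with this era's Imputer (newer versions would use SimpleImputer):

In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.svm import SVC

full_clf = Pipeline([
    ('impute', Imputer(strategy = 'median')),   # median-fill missing values
    ('scale',  StandardScaler()),               # standardize all features
    ('clf',    SVC()),                          # default RBF SVM
])
full_clf.fit(x_train, y_train)
print "Test Score : ", full_clf.score(x_test, y_test)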


In [ ]: