notebook.community

Edit and run



In [42]:

    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [43]:

    
import warnings
warnings.filterwarnings('ignore')



In [44]:

    
sns.set()



In [45]:

    
train_df = pd.read_csv('.\\data\\titanic_train.csv', index_col='PassengerId')

y = train_df.Survived
train_df = train_df.drop(columns=['Survived'])



In [46]:

    
test_df = pd.read_csv('.\\data\\titanic_test.csv', index_col='PassengerId')



In [47]:

    
df = pd.concat([train_df, test_df])



In [48]:

    
df.head(n=2)









    Out[48]:







  
    
      
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      2
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C



In [49]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB



In [50]:

    
# age ...
age_by_class = df.groupby('Pclass')['Age'].median()
df.loc[(df['Pclass'] == 1) & (df['Age'].isnull()), 'Age'] = age_by_class[1]
df.loc[(df['Pclass'] == 2) & (df['Age'].isnull()), 'Age'] = age_by_class[2]
df.loc[(df['Pclass'] == 3) & (df['Age'].isnull()), 'Age'] = age_by_class[3]

df['L_Age'] = np.log(df.Age.values)
df['C_Age'] = pd.cut(df.Age, 5)

df['IsYoung'] = df['Age'] < 16
df.IsYoung = df.IsYoung.astype(int)

# fare, set 0 to nan ...
df.loc[df.Fare == 0, 'Fare'] = np.nan

fare_by_class = df.groupby('Pclass')['Fare'].median()
df.loc[df['Pclass'] == 1, 'Fare'] = df[df['Pclass'] == 1].Fare.fillna(fare_by_class[1])
df.loc[df['Pclass'] == 2, 'Fare'] = df[df['Pclass'] == 2].Fare.fillna(fare_by_class[2])
df.loc[df['Pclass'] == 3, 'Fare'] = df[df['Pclass'] == 3].Fare.fillna(fare_by_class[3])

df['L_Fare'] = np.log(df.Fare.values)
df['C_Fare'] = pd.qcut(df.Fare, 4)

# title
df.Name = df.Name.astype(str)
df['Title'] = df.Name.str.split(',').map(lambda x: x[1].strip()).str.split('.').map(lambda x: x[0].strip())

# unmarried women
df['Title'] = df.Title.replace(['Mlle','Ms'], 'Miss')
# madame
df['Title'] = df.Title.replace(['Mme'], 'Mrs')
# masters
df['Title'] = df.Title.replace(['Jonkheer'], 'Master')
# other men
df['Title'] = df.Title.replace(['Capt', 'Don', 'Major', 'Col', 'Sir', 'Dr', 'Rev'], 'Other_Men')
# other women
df['Title'] = df.Title.replace(['Lady', 'the Countess', 'Dona'], 'Other_Women')
# encoding, just for fun
df['C_Title'] = df.Title.map({ 'Mr': 1, 'Other_Men': 2, 'Master': 3, 'Miss': 4, 'Mrs': 5, 'Other_Women': 6 })

# embarked
df.Embarked = df.Embarked.fillna('S')

# size
df['Size'] = df.SibSp + df.Parch + 1

# is alone
df['IsAlone'] = df.Size == 1
df.IsAlone = df.IsAlone.astype(int)

# categories
df.Title = df.Title.astype('category')
df.Pclass = df.Pclass.astype(int)
df.Sex = df.Sex == 'male'
df.Sex = df.Sex.astype(int)
df.Embarked = df.Embarked.astype('category')



In [51]:

    
df = pd.concat([df, pd.get_dummies(df.Embarked, prefix='Embarked')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Title, prefix='Title')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Pclass, prefix='Pclass')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Sex, prefix='Sex')], axis=1)
df = pd.concat([df, pd.get_dummies(df.C_Age, prefix='CategoryAge')], axis=1)
df = pd.concat([df, pd.get_dummies(df.C_Fare, prefix='CategoryFare')], axis=1)



In [52]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 42 columns):
Pclass                            1309 non-null int32
Name                              1309 non-null object
Sex                               1309 non-null int32
Age                               1309 non-null float64
SibSp                             1309 non-null int64
Parch                             1309 non-null int64
Ticket                            1309 non-null object
Fare                              1309 non-null float64
Cabin                             295 non-null object
Embarked                          1309 non-null category
L_Age                             1309 non-null float64
C_Age                             1309 non-null category
IsYoung                           1309 non-null int32
L_Fare                            1309 non-null float64
C_Fare                            1309 non-null category
Title                             1309 non-null category
C_Title                           1309 non-null int64
Size                              1309 non-null int64
IsAlone                           1309 non-null int32
Embarked_C                        1309 non-null uint8
Embarked_Q                        1309 non-null uint8
Embarked_S                        1309 non-null uint8
Title_Master                      1309 non-null uint8
Title_Miss                        1309 non-null uint8
Title_Mr                          1309 non-null uint8
Title_Mrs                         1309 non-null uint8
Title_Other_Men                   1309 non-null uint8
Title_Other_Women                 1309 non-null uint8
Pclass_1                          1309 non-null uint8
Pclass_2                          1309 non-null uint8
Pclass_3                          1309 non-null uint8
Sex_0                             1309 non-null uint8
Sex_1                             1309 non-null uint8
CategoryAge_(0.0902, 16.136]      1309 non-null uint8
CategoryAge_(16.136, 32.102]      1309 non-null uint8
CategoryAge_(32.102, 48.068]      1309 non-null uint8
CategoryAge_(48.068, 64.034]      1309 non-null uint8
CategoryAge_(64.034, 80.0]        1309 non-null uint8
CategoryFare_(3.17, 7.925]        1309 non-null uint8
CategoryFare_(7.925, 14.5]        1309 non-null uint8
CategoryFare_(14.5, 31.388]       1309 non-null uint8
CategoryFare_(31.388, 512.329]    1309 non-null uint8
dtypes: category(4), float64(4), int32(4), int64(4), object(3), uint8(23)
memory usage: 178.1+ KB



In [53]:

    
df.head(n=3)









    Out[53]:







  
    
      
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      ...
      Sex_1
      CategoryAge_(0.0902, 16.136]
      CategoryAge_(16.136, 32.102]
      CategoryAge_(32.102, 48.068]
      CategoryAge_(48.068, 64.034]
      CategoryAge_(64.034, 80.0]
      CategoryFare_(3.17, 7.925]
      CategoryFare_(7.925, 14.5]
      CategoryFare_(14.5, 31.388]
      CategoryFare_(31.388, 512.329]
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      3
      Braund, Mr. Owen Harris
      1
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      ...
      1
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
    
      2
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      0
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
    
    
      3
      3
      Heikkinen, Miss. Laina
      0
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      ...
      0
      0
      1
      0
      0
      0
      1
      0
      0
      0
    
  

3 rows × 42 columns



In [54]:

    
X = df.iloc[:len(y)]
Xt = df.iloc[len(y):]
XC = pd.concat([X,y], axis=1)

X = X.drop(columns=['Name', 'Ticket', 'Cabin', 'Title', 'C_Age', 'C_Fare', 'Embarked', 'Age', 'Fare', 'Pclass'])
Xt = Xt.drop(columns=['Name', 'Ticket', 'Cabin', 'Title', 'C_Age', 'C_Fare', 'Embarked', 'Age', 'Fare', 'Pclass'])



In [55]:

    
# survival vs not ...
_ = sns.countplot(x='Survived', data=XC)
plt.show()



In [56]:

    
# survival by sex ...
_ = sns.countplot('Sex', hue='Survived', data=XC)
plt.show()



In [57]:

    
# survival by title ...
_ = sns.countplot('Pclass', hue='Survived', data=XC)
plt.show()



In [58]:

    
# survival by embarked ...
_ = sns.countplot('Embarked', hue='Survived', data=XC)
plt.show()



In [59]:

    
# embarked by title ...
_ = sns.countplot('Embarked', hue='Title', data=XC)
plt.show()



In [60]:

    
# survival by title ...
_ = sns.countplot('Title', hue='Survived', data=XC)
plt.show()



In [61]:

    
# survival by family size ...
_ = sns.countplot('Size', hue='Survived', data=XC)
plt.show()



In [62]:

    
# survival by family size ...
_ = sns.countplot('Size', hue='Title', data=XC)
plt.show()



In [63]:

    
# survival, if you have elders ...
_ = sns.countplot('Parch', hue='Survived', data=XC)
plt.show()



In [64]:

    
# how those with siblings survived ...
_ = sns.countplot('SibSp', hue='Survived', data=XC)
plt.show()



In [65]:

    
# 16 and under vs others survived ...
_ = sns.countplot('IsYoung', hue='Survived', data=XC)
plt.show()



In [66]:

    
# fare ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Fare', hue='Survived', data=XC)

_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Fare', hue='Pclass', data=XC)

plt.tight_layout()
plt.show()



In [67]:

    
# age ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Age', hue='Survived', data=XC)

_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Age', hue='Pclass', data=XC)

plt.tight_layout()
plt.show()



In [68]:

    
_ = plt.hist(XC.loc[XC['Survived'] == 0, 'Age'], alpha=.5, color='red', label='Died')
_ = plt.hist(XC.loc[XC['Survived'] == 1, 'Age'], alpha=.5, color='black', label='Survived')
_ = plt.legend()

plt.show()



In [69]:

    
# age
_ = plt.hist(XC[XC['Survived'] == 0].Age, alpha=.5, label='Died')
_ = plt.hist(XC[XC['Survived'] == 1].Age, alpha=.5, label='Survived')
_ = plt.legend()

plt.tight_layout()
plt.show()



In [70]:

    
np.percentile(XC.Age.values, [0, .25, .5, .75, 100])









    Out[70]:





array([  0.42 ,   0.75 ,   0.83 ,   0.974,  80.   ])



In [71]:

    
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA



In [72]:

    
# run PCA ...



In [73]:

    
pca = PCA()
pca.fit(X)

features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()

X.columns[:]



In [88]:

    
columns_to_use = X.columns[:10]



In [89]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=11)



In [90]:

    
grid = {
    'C': np.power(10.0, np.arange(-10, 10))
}

# scoring: area under the ROC.
estimator = GridSearchCV(LogisticRegression(max_iter=10000), grid, cv=5, scoring='roc_auc')
estimator.fit(X_train[columns_to_use], y_train)

reg = estimator.best_estimator_
print('best scores:', estimator.best_score_)









    



best scores: 0.8526296762251818



In [91]:

    
proba = reg.predict_proba(X_test[columns_to_use])[:, 1]
pred = reg.predict(X_test[columns_to_use])

_ = plt.plot(proba, y_test, marker='.', linestyle='none', color='blue', alpha=.45)
_ = plt.axvline(.5, color='red', alpha=.5)
_ = plt.xlabel('proba')
_ = plt.ylabel('actual')

plt.show()



In [92]:

    
from sklearn.metrics import roc_curve, auc



In [93]:

    
fpr, tpr, _ = roc_curve(y_test, proba)

_ = plt.plot(fpr, tpr, label='auc = ' + str(auc(fpr,tpr)))
_ = plt.plot([0,1],[0,1], linestyle='--')
_ = plt.legend(loc='lower right')

plt.show()



In [94]:

    
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



In [95]:

    
print(confusion_matrix(pred, y_test))



In [84]:

    
print(classification_report(pred, y_test))









    



             precision    recall  f1-score   support

          0       0.91      0.88      0.89       184
          1       0.75      0.82      0.78        84

avg / total       0.86      0.86      0.86       268



In [85]:

    
predictions = reg.predict(Xt[columns_to_use])



In [86]:

    
output = pd.DataFrame({
    'PassengerId': Xt.index,
    'Survived': predictions
}).set_index('PassengerId')

#output.to_csv('.\\preds.csv')



In [87]:

    
output.head(n=10)









    Out[87]:







  
    
      
      Survived
    
    
      PassengerId
      
    
  
  
    
      892
      0
    
    
      893
      1
    
    
      894
      0
    
    
      895
      0
    
    
      896
      1
    
    
      897
      0
    
    
      898
      1
    
    
      899
      0
    
    
      900
      1
    
    
      901
      0



In [ ]:

	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
PassengerId
1	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
2	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C