notebook.community

Edit and run



In [2]:

    
import pandas #ipython notebook
titanic = pandas.read_csv("titanic_train.csv")
#titanic.head(3)
print (titanic.describe())









    



       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200



In [3]:

    
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print titanic.describe()









    



       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200



In [4]:

    
print titanic["Sex"].unique()

# Replace all the occurences of male with the number 0.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1









    



['male' 'female']



In [5]:

    
print titanic["Embarked"].unique()
titanic["Embarked"] = titanic["Embarked"].fillna('S')
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2









    



['S' 'C' 'Q' nan]



In [6]:

    
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (titanic[predictors].iloc[train,:])
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)









    



/Users/jark/anaconda/lib/python2.7/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)



In [7]:

    
import numpy as np

# The predictions are in three separate numpy arrays.  Concatenate them into one.  
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0
accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions)
print accuracy









    



0.261503928171



In [8]:

    
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())









    



0.787878787879



In [45]:

    
titanic_test = pandas.read_csv("test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0 
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2



In [46]:

    
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

# Take the mean of the scores (because we have one for each fold)
print(scores.mean())









    



0.785634118967



In [47]:

    
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)

# Take the mean of the scores (because we have one for each fold)
print(scores.mean())









    



0.814814814815



In [48]:

    
# Generating a familysize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]

# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))



In [49]:

    
import re

# A function to get the title from a name.
def get_title(name):
    # Use a regular expression to search for a title.  Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(pandas.value_counts(titles))

# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything.
print(pandas.value_counts(titles))

# Add in the title column.
titanic["Title"] = titles









    



Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Countess      1
Ms            1
Lady          1
Jonkheer      1
Don           1
Mme           1
Capt          1
Sir           1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64



In [50]:

    
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]

# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])

# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)

# Plot the scores.  See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)



In [51]:

    
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train,:], train_target)
        # Select and predict on the test fold.  
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)

# Compute accuracy by comparing to the training data.
accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)









    



0.821548821549






    



e:\python27\lib\site-packages\ipykernel\__main__.py:37: FutureWarning: in the future, boolean array-likes will be handled as a boolean array index



In [17]:

    
titles = titanic_test["Name"].apply(get_title)
# We're adding the Dona title to the mapping, because it's in the test set, but not the training set
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2, "Dona": 10}
for k,v in title_mapping.items():
    titles[titles == k] = v
titanic_test["Title"] = titles
# Check the counts of each unique title.
print(pandas.value_counts(titanic_test["Title"]))

# Now, we add the family size column.
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]









    



1     240
2      79
3      72
4      21
7       2
6       2
10      1
5       1
Name: Title, dtype: int64



In [18]:

    
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]
    full_predictions.append(predictions)

# The gradient boosting classifier generates better predictions, so we weight it higher.
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions









    Out[18]:





array([ 0.11682912,  0.47835566,  0.12614824,  0.13098157,  0.52105874,
        0.1435209 ,  0.64085331,  0.18003152,  0.67801353,  0.12111118,
        0.12105181,  0.20902118,  0.91068381,  0.1089127 ,  0.89142102,
        0.87713474,  0.16349859,  0.13907791,  0.54103238,  0.55661006,
        0.22420875,  0.5372079 ,  0.90572223,  0.38890588,  0.88384752,
        0.10357315,  0.90909441,  0.13746454,  0.31046249,  0.12665718,
        0.11663767,  0.18274855,  0.55220994,  0.49648575,  0.42415297,
        0.14191051,  0.50973638,  0.52452209,  0.13270506,  0.28366691,
        0.11145281,  0.46618807,  0.09996501,  0.83420617,  0.89959119,
        0.14983417,  0.31593419,  0.13789623,  0.89104185,  0.54189565,
        0.35666363,  0.17718135,  0.8307195 ,  0.87995521,  0.1755907 ,
        0.13741805,  0.10667279,  0.1234385 ,  0.12099736,  0.91285169,
        0.13099159,  0.15341948,  0.12993967,  0.66573206,  0.66343836,
        0.87272604,  0.67238712,  0.288265  ,  0.35236574,  0.85565507,
        0.6622414 ,  0.12701993,  0.55390065,  0.36740462,  0.91110312,
        0.41201902,  0.13014004,  0.83671279,  0.15614414,  0.6622414 ,
        0.68129213,  0.20605719,  0.20382623,  0.12105181,  0.18486634,
        0.13130212,  0.65680539,  0.53029858,  0.65489631,  0.79881212,
        0.53764546,  0.12104028,  0.8913725 ,  0.13014004,  0.28406245,
        0.12345367,  0.86792484,  0.14666337,  0.58599461,  0.12260781,
        0.90433464,  0.14730817,  0.13789623,  0.12262433,  0.62257491,
        0.13155874,  0.14607753,  0.13789623,  0.13020336,  0.17473033,
        0.14286392,  0.65490316,  0.89528117,  0.67146758,  0.88346017,
        0.13992078,  0.11805064,  0.69612515,  0.36668939,  0.86241698,
        0.87649291,  0.12609327,  0.90276371,  0.12099027,  0.13789623,
        0.56971935,  0.12608181,  0.63733743,  0.13339996,  0.13340574,
        0.12723637,  0.51609607,  0.23921874,  0.10791695,  0.09896737,
        0.12431124,  0.13346495,  0.16214099,  0.52029433,  0.12232635,
        0.20712059,  0.90529649,  0.19747926,  0.16153716,  0.42927593,
        0.10487176,  0.33642492,  0.13518414,  0.46618807,  0.34478758,
        0.91431377,  0.13214999,  0.10690998,  0.48983645,  0.11274825,
        0.12427868,  0.9107016 ,  0.57991631,  0.42927593,  0.51274048,
        0.65489239,  0.57884522,  0.82113381,  0.12096648,  0.28979611,
        0.58587108,  0.30130471,  0.14606803,  0.9025041 ,  0.52257377,
        0.12101884,  0.13299498,  0.12418534,  0.13207486,  0.1319655 ,
        0.8729358 ,  0.87633414,  0.29670328,  0.83389526,  0.85558679,
        0.15614414,  0.33352246,  0.90219082,  0.13789623,  0.91718918,
        0.13603003,  0.85482389,  0.12241402,  0.14217314,  0.13560687,
        0.1348803 ,  0.25547183,  0.49950989,  0.12729496,  0.71980831,
        0.10795469,  0.85516508,  0.58990449,  0.16645668,  0.53980354,
        0.64867969,  0.66329187,  0.60981573,  0.87333314,  0.16322638,
        0.25696649,  0.63083524,  0.16482591,  0.88984707,  0.12346408,
        0.12849653,  0.12097124,  0.24675029,  0.80199995,  0.41248342,
        0.29768148,  0.65492663,  0.21860346,  0.90027407,  0.13014004,
        0.8137002 ,  0.13611142,  0.84275393,  0.12700828,  0.87789288,
        0.59807994,  0.12518087,  0.65489631,  0.11487493,  0.1441311 ,
        0.25075165,  0.89266286,  0.11622683,  0.1379133 ,  0.34224639,
        0.12796773,  0.19365861,  0.14018901,  0.80948189,  0.89790832,
        0.87598967,  0.82598174,  0.33036559,  0.12105101,  0.33258156,
        0.28710745,  0.8790295 ,  0.16058987,  0.86241698,  0.59133092,
        0.74586492,  0.15434326,  0.39647431,  0.13354268,  0.12701864,
        0.12101884,  0.13789623,  0.13014004,  0.83005787,  0.12700585,
        0.10894954,  0.12701508,  0.85003763,  0.64929875,  0.16619539,
        0.12105181,  0.21821016,  0.12101884,  0.50973638,  0.14016481,
        0.34495861,  0.13789623,  0.91564   ,  0.6332826 ,  0.13207439,
        0.85713531,  0.15861636,  0.12500116,  0.14267175,  0.16811853,
        0.52045075,  0.66231856,  0.65489631,  0.64136782,  0.71198852,
        0.10601085,  0.12099027,  0.3627808 ,  0.13207486,  0.13014004,
        0.33304456,  0.59319589,  0.13207486,  0.50584352,  0.12081676,
        0.12263655,  0.77903176,  0.12665718,  0.33024483,  0.12028976,
        0.11813957,  0.17547887,  0.1216941 ,  0.13347145,  0.65489631,
        0.82133626,  0.33497525,  0.67696014,  0.20916505,  0.42575111,
        0.13912869,  0.13799529,  0.12102122,  0.61904744,  0.90111957,
        0.67393647,  0.23919457,  0.17328806,  0.12182854,  0.18522951,
        0.12262433,  0.13491478,  0.16214099,  0.45541306,  0.90601333,
        0.12509883,  0.86563776,  0.34598576,  0.14469719,  0.17034218,
        0.82147627,  0.32823572,  0.13207439,  0.64322911,  0.12183262,
        0.25111398,  0.15333425,  0.09370087,  0.20950803,  0.35411806,
        0.17507148,  0.118123  ,  0.1469565 ,  0.91556464,  0.33657652,
        0.618368  ,  0.16214099,  0.62462682,  0.1654289 ,  0.85157883,
        0.89603825,  0.16322638,  0.24472808,  0.16066609,  0.70031025,
        0.15642457,  0.85672648,  0.12105022,  0.13789623,  0.57255235,
        0.10418822,  0.87672475,  0.86918839,  0.13098157,  0.91914163,
        0.15715004,  0.1313025 ,  0.53322127,  0.89562968,  0.17356053,
        0.15319843,  0.90891499,  0.16307942,  0.13130575,  0.87654859,
        0.90969185,  0.48853359,  0.17002326,  0.19866966,  0.13510974,
        0.13789623,  0.14010265,  0.54133852,  0.5949924 ,  0.15905635,
        0.83276875,  0.12430276,  0.12019388,  0.14606637,  0.18789784,
        0.38579307,  0.87750065,  0.56459193,  0.12807839,  0.10318132,
        0.91169572,  0.14231524,  0.88773179,  0.12607946,  0.12971145,
        0.90753797,  0.12635163,  0.90891637,  0.35988713,  0.30442425,
        0.18966803,  0.1501521 ,  0.26822399,  0.65488945,  0.64585313,
        0.65489631,  0.90711865,  0.56933478,  0.13014004,  0.86010063,
        0.10126674,  0.13014004,  0.41850311])



In [ ]:



In [ ]: