In [1]:
# Imports
import pandas as pd
import numpy as np

# machine learning
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:
# Helper functions

# Count the occurrences of each value in a list (NaN values are tallied under 'nan')
def word_count(data_list):
    data_dict = {}
    data_dict['nan'] = 0
    for item in data_list:
        if pd.isnull(item):
            data_dict['nan'] += 1
        else:
            if item not in data_dict:
                data_dict[item] = 1
            else:
                data_dict[item] += 1
    return data_dict
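# Example (not in the original): word_count tallies each distinct value, counting
# NaN/None under the key 'nan'; roughly pd.Series(x).value_counts(dropna=False).
print word_count(['a', 'b', 'a', None])   # {'a': 2, 'nan': 1, 'b': 1}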

# Compute precision, recall, and accuracy via cross-validation
def performance(clf, X_train, Y_train, cv_num=4):
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='precision')
    print "precision is {}".format(scores.mean())

    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='recall')
    print "recall is {}".format(scores.mean())

    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='accuracy')
    print "accuracy is {}".format(scores.mean())
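# Alternative (a sketch, assuming scikit-learn >= 0.19): cross_validate can score
# several metrics in one pass instead of three separate cross_val_score calls.
from sklearn.model_selection import cross_validate

def performance_one_pass(clf, X_train, Y_train, cv_num=4):
    results = cross_validate(clf, X_train, Y_train, cv=cv_num,
                             scoring=['precision', 'recall', 'accuracy'])
    for metric in ['precision', 'recall', 'accuracy']:
        print "{} is {}".format(metric, results['test_' + metric].mean())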

In [7]:
# get titanic & test csv files as a DataFrame
titanic_df = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/train.csv")
test_df = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/test.csv")

In [8]:
# Take a quick look at the data
titanic_df.head()


Out[8]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [9]:
titanic_df.info()
print "--------------"
test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
--------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

In [10]:
titanic_df.columns


Out[10]:
Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [11]:
# Look at the distribution of values in each column

print word_count(titanic_df['Survived'].tolist())
print word_count(titanic_df[u'Pclass'].tolist())
print word_count(titanic_df[u'Sex'].tolist())
# print word_count(titanic_df[u'SibSp'].tolist())
# print word_count(titanic_df[u'Parch'].tolist())
# print word_count(titanic_df[u'Embarked'].tolist())
# print word_count(titanic_df[u'Fare'].tolist())


{0: 549, 1: 342, 'nan': 0}
{1: 216, 2: 184, 3: 491, 'nan': 0}
{'male': 577, 'nan': 0, 'female': 314}

In [12]:
# Findings
# PassengerId  running index, not informative
# Survived     {0: 549, 1: 342, 'nan': 0}
# Pclass       {1: 216, 2: 184, 3: 491, 'nan': 0}
# Name         not informative
# Sex          {'female': 314, 'male': 577, 'nan': 0}
# Age          0.42 - 80, 'nan': 177 -- needs handling
# SibSp        {0: 608, 1: 209, 2: 28, 3: 16, 4: 18, 5: 5, 8: 7, 'nan': 0}
# Parch        {0: 678, 1: 118, 2: 80, 3: 5, 4: 4, 5: 5, 6: 1, 'nan': 0}
# Ticket       not informative
# Fare         0 - 512
# Cabin        too many NaNs
# Embarked     {'C': 168, 'Q': 77, 'S': 644, 'nan': 2}

# Processing plan
# Drop PassengerId, Name, Ticket (not informative) and Cabin (too many NaNs)
# Age: fill NaNs with random ages (see the reproducibility note below)
# Embarked: fill the 2 NaNs with the most common value, 'S'
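# Note (not part of the original notebook): the random-age fill in the next two
# cells draws from NumPy's global RNG, so results vary between runs unless the
# RNG is seeded first, e.g.:
np.random.seed(443)   # arbitrary example seed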

In [13]:
# Process titanic_df

# Drop PassengerId, Name, Ticket (not informative) and Cabin (too many NaNs)
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

# Embarked: fill the 2 NaNs with the most common value, 'S'
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# Age: fill NaNs with random ages drawn around the mean
# get average, std, and number of NaN values in titanic_df
average_age_titanic   = titanic_df["Age"].mean()
std_age_titanic       = titanic_df["Age"].std()
count_nan_age_titanic = titanic_df["Age"].isnull().sum()
# generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_titanic - std_age_titanic,
                           average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
# assign with .loc to avoid pandas' SettingWithCopyWarning
titanic_df.loc[titanic_df["Age"].isnull(), "Age"] = rand_1

In [14]:
# Process test_df

# Keep PassengerId for the submission file, then drop the uninformative columns
test_passengerId = test_df["PassengerId"]
test_df = test_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

# Age: fill NaNs with random ages
# get average, std, and number of NaN values in test_df
average_age_test_titanic   = test_df["Age"].mean()
std_age_test_titanic       = test_df["Age"].std()
count_nan_age_test_titanic = test_df["Age"].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)
rand_2 = np.random.randint(average_age_test_titanic - std_age_test_titanic,
                           average_age_test_titanic + std_age_test_titanic, size = count_nan_age_test_titanic)
# assign with .loc to avoid pandas' SettingWithCopyWarning
test_df.loc[test_df["Age"].isnull(), "Age"] = rand_2

In [15]:
# data processing

# train data
# Separate the target (Survived) from the training features
X_train = titanic_df.drop("Survived", axis=1)
Y_train = titanic_df["Survived"]

# One-hot encode Embarked and Sex with get_dummies
Embarked_dummies = pd.get_dummies(X_train['Embarked'])
Sex_dummies = pd.get_dummies(X_train['Sex'])
X_train.drop(['Embarked','Sex'], axis=1, inplace=True)
X_train = X_train.join(Embarked_dummies).join(Sex_dummies)

# scale all features to [0, 1] with MinMaxScaler
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)

# test data
# One-hot encode Embarked and Sex
Embarked_dummies = pd.get_dummies(test_df['Embarked'])
Sex_dummies = pd.get_dummies(test_df['Sex'])
test_df.drop(['Embarked','Sex'], axis=1, inplace=True)
test_df = test_df.join(Embarked_dummies).join(Sex_dummies)

# Fill the single missing Fare in the test set with the mean fare
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# scale the test features with the MinMaxScaler fitted on the training data
X_test_minmax = min_max_scaler.transform(test_df)
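# Sanity check (not in the original): the fitted scaler applies the training
# columns' min/max positionally, so the test columns must match X_train's
# columns, in the same order.
assert list(test_df.columns) == list(X_train.columns)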

In [16]:
# Data processing is done; train and evaluate the models with 4-fold cross-validation

# individual classifiers
clf1 = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=443)
print "rf"
performance(clf1, X_train_minmax, Y_train)

clf2 = svm.SVC()
print "svm"
performance(clf2, X_train_minmax, Y_train)

clf3 = AdaBoostClassifier(n_estimators=100)
print "Ada"
performance(clf3, X_train_minmax, Y_train)

clf4 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
print "Gbc"
performance(clf4, X_train_minmax, Y_train)

eclf = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('Ada', clf3), ('Gbc', clf4)], voting='hard')
print "eclf"
performance(eclf, X_train_minmax, Y_train)


rf
precision is 0.774514108003
recall is 0.716313269494
accuracy is 0.811460037672
svm
precision is 0.741531165312
recall is 0.680950752394
accuracy is 0.786710647627
Ada
precision is 0.749353497355
recall is 0.722058823529
accuracy is 0.800284354061
Gbc
precision is 0.774379631907
recall is 0.736730506156
accuracy is 0.815994841534
eclf
precision is 0.815167615433
recall is 0.695554035568
accuracy is 0.822650870962
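# Variation (a sketch, not part of the original run): 'soft' voting averages the
# classifiers' predicted probabilities instead of counting hard votes; SVC must
# then be constructed with probability=True so it exposes predict_proba.
clf2_soft = svm.SVC(probability=True)
eclf_soft = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2_soft),
                                         ('Ada', clf3), ('Gbc', clf4)], voting='soft')
# performance(eclf_soft, X_train_minmax, Y_train)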

In [17]:
eclf = eclf.fit(X_train_minmax, Y_train)

In [18]:
test_predict = eclf.predict(X_test_minmax)
submission = pd.DataFrame({
        "PassengerId": test_passengerId,
        "Survived": test_predict
    })

In [61]:
submission.to_csv('/Users/wy/Desktop/titanic2.csv', index=False)
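# Quick check (not in the original): Kaggle's Titanic submission expects exactly
# two columns, PassengerId and Survived, with one row per test passenger.
print submission.shape   # expected: (418, 2)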