In [1]:
# Imports
import pandas as pd
import numpy as np
# machine learning
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
In [2]:
# 自定義的function
# 算出list各個值的數量
# Count how many times each value appears in a list.
def word_count(data_list):
    """Return a dict mapping each distinct value to its count.

    Missing values (as judged by pd.isnull) are tallied under the
    key 'nan', which is always present (0 when there are none).
    """
    counts = {'nan': 0}
    for value in data_list:
        if pd.isnull(value):
            counts['nan'] += 1
        else:
            counts[value] = counts.get(value, 0) + 1
    return counts
# Report cross-validated precision, recall, and accuracy for a classifier.
def performance(clf, X_train, Y_train, cv_num=4):
    """Print the mean cross-validated precision, recall, and accuracy.

    Parameters
    ----------
    clf : sklearn-compatible estimator (implements fit/predict).
    X_train : feature matrix.
    Y_train : binary target vector.
    cv_num : int, number of cross-validation folds (default 4).
    """
    # print(...) with a single argument behaves identically under
    # Python 2 and 3, unlike the original py2-only `print` statement.
    # The three near-identical copy-pasted calls are folded into a loop.
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='precision')
    print("precision is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='recall')
    print("recall is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='accuracy')
    print("accuracy is {}".format(scores.mean()))
In [7]:
# get titanic & test csv files as a DataFrame
# NOTE(review): hardcoded absolute local paths — consider a configurable
# DATA_DIR (e.g. pathlib.Path) so the notebook runs on other machines.
titanic_df = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/train.csv")
test_df = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/test.csv")
In [8]:
# Take a quick look at what the data looks like
titanic_df.head()
Out[8]:
In [9]:
# Column dtypes and non-null counts for train vs test
titanic_df.info()
print("--------------")  # print(...) works in both Python 2 and 3
test_df.info()
In [10]:
# List all column names of the training frame
titanic_df.columns
Out[10]:
In [11]:
# Examine the value distribution of each column
# (print(...) instead of the py2-only print statement; output is unchanged)
print(word_count(titanic_df['Survived'].tolist()))
print(word_count(titanic_df[u'Pclass'].tolist()))
print(word_count(titanic_df[u'Sex'].tolist()))
# print(word_count(titanic_df[u'SibSp'].tolist()))
# print(word_count(titanic_df[u'Parch'].tolist()))
# print(word_count(titanic_df[u'Embarked'].tolist()))
# print(word_count(titanic_df[u'Fare'].tolist()))
In [12]:
# 結果
# PassengerId 流水編號 無意義
# Survived {0: 549, 1: 342, 'nan': 0}
# Pclass {1: 216, 2: 184, 3: 491, 'nan': 0}
# Name 無意義
# Sex {'female': 314, 'male': 577, 'nan': 0}
# Age 0.42 - 80 'nan':177 要處理
# SibSp {0: 608, 1: 209, 2: 28, 3: 16, 4: 18, 5: 5, 8: 7, 'nan': 0}
# Parch {0: 678, 1: 118, 2: 80, 3: 5, 4: 4, 5: 5, 6: 1, 'nan': 0}
# Ticket 無意義
# Fare 0 - 512
# Cabin nan 過多
# Embarked {'C': 168, 'Q': 77, 'S': 644, 'nan': 2}
# 處理方式
# PassengerId, Name, Ticket 無意義拿掉
# Age 產生隨機年齡遞補
# Embarked nan 2個 填上最多的 S
In [13]:
# Clean titanic_df
# Drop uninformative columns (PassengerId, Name, Ticket) and mostly-NaN Cabin
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
# Embarked has 2 NaNs: fill with the most frequent value, "S"
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")
# Age: impute missing values with random ages around the mean
# get average, std, and number of NaN values in titanic_df
average_age_titanic = titanic_df["Age"].mean()
std_age_titanic = titanic_df["Age"].std()
count_nan_age_titanic = titanic_df["Age"].isnull().sum()
# generate random integers between (mean - std) & (mean + std)
# NOTE(review): no fixed seed — imputed ages differ across runs.
rand_1 = np.random.randint(average_age_titanic - std_age_titanic,
                           average_age_titanic + std_age_titanic,
                           size=count_nan_age_titanic)
# BUG FIX: use .loc with a boolean mask instead of chained indexing
# (titanic_df["Age"][mask] = ...), which raises SettingWithCopyWarning
# and can silently fail to write back to the frame.
titanic_df.loc[titanic_df["Age"].isnull(), "Age"] = rand_1
In [14]:
# Clean test_df
# Keep PassengerId for the submission file, then drop uninformative columns
test_passengerId = test_df["PassengerId"]
test_df = test_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
# Age: impute missing values with random ages around the mean
# get average, std, and number of NaN values in test_df
average_age_test_titanic = test_df["Age"].mean()
std_age_test_titanic = test_df["Age"].std()
count_nan_age_test_titanic = test_df["Age"].isnull().sum()
# generate random integers between (mean - std) & (mean + std)
rand_2 = np.random.randint(average_age_test_titanic - std_age_test_titanic,
                           average_age_test_titanic + std_age_test_titanic,
                           size=count_nan_age_test_titanic)
# BUG FIX: .loc avoids the chained-indexing assignment of the original
# (SettingWithCopyWarning / possible silent no-op).
test_df.loc[test_df["Age"].isnull(), "Age"] = rand_2
In [15]:
# data processing
# Train data: split off the label
X_train = titanic_df.drop("Survived",axis=1)
Y_train = titanic_df["Survived"]
# One-hot encode the categorical columns Embarked and Sex
Embarked_dummies = pd.get_dummies(X_train['Embarked'])
Sex_dummies = pd.get_dummies(X_train['Sex'])
X_train.drop(['Embarked','Sex'], axis=1, inplace=True)
X_train = X_train.join(Embarked_dummies).join(Sex_dummies)
# Min-max scale, fitting the scaler on the TRAINING data only
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
# Test data: same one-hot encoding
Embarked_dummies = pd.get_dummies(test_df['Embarked'])
Sex_dummies = pd.get_dummies(test_df['Sex'])
test_df.drop(['Embarked','Sex'], axis=1, inplace=True)
test_df = test_df.join(Embarked_dummies).join(Sex_dummies)
# Fill the single missing test Fare with the mean
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
# BUG FIX: use transform (not fit_transform) so the test set is scaled
# with the statistics learned from the training set; refitting on test
# makes train/test features inconsistent and leaks test information.
X_test_minmax = min_max_scaler.transform(test_df)
In [16]:
# Data preparation done — train and evaluate the models
# (print(...) instead of the py2-only print statement; output is unchanged)
clf1 = RandomForestClassifier(n_estimators=50, max_depth=None,min_samples_split=2, random_state=443)
print("rf")
performance(clf1, X_train_minmax, Y_train)
clf2 = svm.SVC()
print("svm")
performance(clf2, X_train_minmax, Y_train)
clf3 = AdaBoostClassifier(n_estimators=100)
print("Ada")
performance(clf3, X_train_minmax, Y_train)
clf4 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
print("Gbc")
performance(clf4, X_train_minmax, Y_train)
# Hard-voting ensemble of the four classifiers above
eclf = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('Ada', clf3), ('Gbc', clf4)], voting='hard')
print("eclf")
performance(eclf, X_train_minmax, Y_train)
In [17]:
# Refit the voting ensemble on the full training set for final predictions
eclf = eclf.fit(X_train_minmax,Y_train)
In [18]:
# Predict on the test set and assemble the Kaggle submission frame
# (PassengerId was saved before the column was dropped from test_df)
test_predict = eclf.predict(X_test_minmax)
submission = pd.DataFrame({
"PassengerId": test_passengerId,
"Survived": test_predict
})
In [61]:
# Write the submission CSV (index=False: Kaggle expects no index column)
# NOTE(review): hardcoded absolute local path — parameterize for portability.
submission.to_csv('/Users/wy/Desktop/titanic2.csv', index=False)