In [2]:
import numpy as np
import pandas as pd
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR
# so the notebook runs on other machines.
filepath = '/Users/mac/Desktop/Kaggle_datasets/20170824_Titanic/'
filename1 = 'train.csv'
filename2 = 'test.csv'
# Load the Kaggle Titanic train/test splits into separate DataFrames.
dfTrain = pd.read_csv(filepath + filename1)
dfTest = pd.read_csv(filepath + filename2)
In [3]:
# Column dtypes and non-null counts for the training set.
dfTrain.info()
In [4]:
# Column dtypes and non-null counts for the test set (no Survived column).
dfTest.info()
In [5]:
# PassengerId is unique, so use it as the index; this also makes it easy to
# tell train rows (ids 1-891) from test rows (ids 892+) later on.
dfTrain = dfTrain.set_index('PassengerId')
dfTest = dfTest.set_index('PassengerId')
In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
In [7]:
# Handy one-liner: group by the x-axis column, aggregate Survived, and plot —
# all in a single chain.
dfTrain.groupby('Pclass').Survived.mean().plot(kind='bar')
Out[7]:
In [15]:
# Survival rate per passenger class, as numbers rather than a plot.
dfTrain.groupby('Pclass').Survived.mean()
Out[15]:
In [16]:
plt.bar([1,2,3], dfTrain.groupby('Pclass').Survived.mean(), width = 0.5)
# The familiar matplotlib way — but doing the grouping by hand is more painful.
Out[16]:
In [17]:
dfTrain['Pclass'].value_counts().plot(kind='bar') # class counts straight from value_counts — worth remembering!
Out[17]:
Scikit-learn的DecisionTreeClassifier運作方式:
設定X_train、標準答案y和X_test,先建立模型 dtree = DecisionTreeClassifier(),再跑 dtree.fit(X_train, y)
結果用同一個已訓練的實例 dtree.predict(X_test) 顯示(fit 和 predict 必須用同一個模型實例,不能各自新建)
In [13]:
from sklearn.tree import DecisionTreeClassifier
# Baseline model: predict survival from Pclass alone.
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values, which strips the
# index name — otherwise the CSV below lacks the 'PassengerId' header that the
# Kaggle submission format requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [20]:
# First few predicted rows, indexed by PassengerId.
dfPrediction.head()
Out[20]:
In [22]:
# Survival rate by sex.
dfTrain.groupby('Sex').Survived.mean().plot(kind='bar', width = 0.3)
Out[22]:
seaborn初體驗
In [23]:
import seaborn as sns
# NOTE(review): factorplot was renamed catplot in seaborn >= 0.9 — confirm the
# installed version if this cell warns or fails.
sns.factorplot("Sex", "Survived", hue="Pclass", data=dfTrain) # hue adds a third factor to the plot — very handy!
Out[23]:
In [24]:
# Stack train on top of test so feature encodings are applied consistently to
# both, then turn Sex into an integer code (male=0, female=1).
dfFull = pd.concat([dfTrain, dfTest])
dfFull['Sex'] = dfFull['Sex'].replace({'male': 0, 'female': 1}).astype(int)
# Re-slice: PassengerId 1-891 is the train portion, 892 onward is the test portion.
dfTrain = dfFull.loc[1:891]
dfTest = dfFull.loc[892:]
In [25]:
# Model on class + sex.
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [26]:
# Quick look at the updated predictions.
dfPrediction.head()
Out[26]:
In [27]:
# Survival rate by number of parents/children aboard.
dfTrain.groupby('Parch').Survived.mean().plot(kind='bar')
Out[27]:
In [28]:
# Distribution of Parch values across the full (train + test) dataset.
dfFull.Parch.value_counts()
Out[28]:
In [31]:
dfFull['ParchCat'] = dfFull.Parch.copy().astype(int) # start the new column as a copy of the original
dfFull.loc[dfFull.Parch > 2,'ParchCat'] = 3 # merge everything above 2 into one bucket so each group is large enough to be statistically meaningful
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
In [32]:
# Survival rate per ParchCat bucket.
dfTrain.groupby('ParchCat').Survived.mean().plot(kind='bar', width=0.3)
Out[32]:
In [33]:
# Survival rate by number of siblings/spouses aboard.
dfTrain.groupby('SibSp').Survived.mean().plot(kind='bar')
Out[33]:
In [34]:
# Same bucketing as ParchCat: keep 0-2 as-is, fold everything above 2 into
# category 3 so each group has enough members.
dfFull['SibSpCat'] = np.where(dfFull.SibSp > 2, 3, dfFull.SibSp).astype(int)
dfTrain = dfFull.loc[1:891]
dfTest = dfFull.loc[892:]
In [36]:
# Survival rate per SibSpCat bucket.
dfTrain.groupby('SibSpCat').Survived.mean().plot(kind='bar')
Out[36]:
In [37]:
# Model on class, sex, and both family-size buckets.
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','SibSpCat']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','SibSpCat']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1) # these results were not good
In [38]:
# Survival rate by port of embarkation.
dfTrain.groupby('Embarked').Survived.mean().plot(kind='bar')
Out[38]:
In [40]:
# Two-variable breakdown with counts, all in one call.
dfFull.groupby(['Embarked','Sex']).Name.count()
Out[40]:
In [41]:
# Passenger counts per (port, class) combination.
dfFull.groupby(['Embarked','Pclass']).Name.count()
Out[41]:
In [42]:
dfFull[dfFull.Embarked.isnull()] # list the passengers with a missing port directly via the null mask
Out[42]:
In [45]:
# PassengerIds whose embarkation port is missing.
indexEmbarked = dfFull[dfFull.Embarked.isnull()].index.tolist()
# Impute each missing port from "similar" passengers: fare within +/-10% and
# the same class; take the most common port among them.
for indEmb in indexEmbarked:
    fareEmbarked = dfFull.loc[indEmb].Fare.mean()
    similar = dfFull[(dfFull.Fare < fareEmbarked * 1.1) &
                     (dfFull.Fare > fareEmbarked * 0.9) &
                     (dfFull.Pclass == dfFull.loc[indEmb].Pclass)]
    predictedEmbarked = similar.Embarked.mode()
    dfFull.loc[indEmb, 'Embarked'] = predictedEmbarked[0]
    print(predictedEmbarked)
In [61]:
dfFull.loc[[62,630]] # both rows are filled in now — presumably 62 and 630 were the two passengers listed above with missing Embarked; verify
Out[61]:
In [51]:
# Recheck non-null counts after the Embarked imputation.
dfFull.info()
In [54]:
# Encode the port as an integer (S=0, Q=1, C=2) now that no values are missing.
dfFull['Embarked'] = dfFull['Embarked'].map({'S': 0, 'Q': 1, 'C': 2}).astype(int)
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
In [55]:
# Model on class, sex, ParchCat, and port.
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [56]:
# PassengerIds with a missing fare, then display those rows.
nullFares = dfFull[dfFull.Fare.isnull()].index.values
dfFull.loc[nullFares]
Out[56]:
In [57]:
# Impute the missing fare from passengers with the same profile:
# no parents/children aboard (ParchCat 0), 3rd class, embarked at S (code 0).
# (Removed a dead no-op expression that only re-read the column.)
dfFull.loc[nullFares,'Fare'] = dfFull[(dfFull.ParchCat == 0) &
                                      (dfFull.Pclass == 3) &
                                      (dfFull.Embarked == 0)].Fare.mean()
# NOTE(review): 1044 is assumed to be the lone passenger with a missing fare
# shown above — confirm against the dataset.
dfFull.loc[[1044]]
Out[57]:
In [50]:
# NOTE(review): plt was already imported earlier; the re-import is harmless.
import matplotlib.pyplot as plt
# Stacked fare histogram split by survival.
plt.hist([dfTrain[dfTrain['Survived']==1]['Fare'],
dfTrain[dfTrain['Survived']==0]['Fare']],
stacked = True, # stack the two groups on top of each other
color = ['g','r'],
bins = 10,
label = ['Survived','Not Survived'])
plt.legend()
plt.ylabel('No. of Passengers')
plt.xlabel('Fare')
Out[50]:
In [62]:
# Same histogram, but with the two groups drawn side by side.
plt.hist([dfTrain[dfTrain['Survived']==1]['Fare'],
dfTrain[dfTrain['Survived']==0]['Fare']],
stacked = False, # draw the groups side by side instead of stacked
color = ['g','r'],
bins = 10,
label = ['Survived','Not Survived'])
plt.legend()
plt.ylabel('No. of Passengers')
plt.xlabel('Fare')
Out[62]:
In [63]:
# Binarize Fare: above the overall mean -> 1, at/below the mean -> 0
# (turn the continuous column into a binary feature).
fareMean = dfFull.Fare.mean()
dfFull.loc[dfFull.Fare <= fareMean,'Fare'] = 0
dfFull.loc[dfFull.Fare > fareMean,'Fare'] = 1
# BUG FIX: re-slice train/test here. dfTrain/dfTest are copies taken from
# dfFull earlier, so without this the next model cell would still see the
# raw continuous fares instead of the binarized values.
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
dfFull.Fare.value_counts()
Out[63]:
In [64]:
# Re-slice from dfFull so this cell picks up the binarized Fare column;
# dfTrain/dfTest are copies and do not track later changes to dfFull.
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [65]:
dfFull.Cabin.value_counts(dropna=False) # messy — no obvious structure in the raw values
Out[65]:
In [66]:
# Classify cabins by their first letter (the deck).
dfFull.Cabin.str[0].value_counts(dropna=False)
Out[66]:
In [69]:
# Bucket cabins by deck letter. Missing values plus the rare G and T decks
# all become 'Z', and the letters are then encoded as integers.
deck = dfFull.Cabin.str[0].fillna('Z').replace({'G': 'Z', 'T': 'Z'})
dfFull['CabinCat'] = deck.map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'Z': 6}).astype(int)
dfTrain = dfFull.loc[1:891]
dfTest = dfFull.loc[892:]
In [70]:
# Survival rate per cabin/deck category.
dfTrain.groupby('CabinCat').Survived.mean().plot(kind='bar')
Out[70]:
In [71]:
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare','CabinCat']]
# NOTE(review): every selected column is already numeric at this point, so
# get_dummies is a no-op here; kept for behavior parity.
X_train = pd.get_dummies(X_train)
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare','CabinCat']]
X_test = pd.get_dummies(X_test)
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [72]:
# Extract just the title (the word ending in a period) from each name via regex.
# Raw string fixes the invalid '\.' escape-sequence warning in the original.
dfFull['Title'] = dfFull.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
dfFull.Title.value_counts()
Out[72]:
In [73]:
# Collapse all the rare titles into one group (code 0) and number the four
# common titles 1-4.
rare_titles = ['Rev', 'Dr', 'Col', 'Major', 'Mlle', 'Ms', 'Countess', 'Capt',
               'Dona', 'Don', 'Sir', 'Lady', 'Jonkheer', 'Mme']
title_codes = {title: 0 for title in rare_titles}
title_codes.update({'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4})
dfFull['TitleCat'] = dfFull['Title'].replace(title_codes)
dfFull.TitleCat.value_counts(dropna=False)
Out[73]:
In [74]:
# Correlation of TitleCat with the other columns.
# NOTE(review): pandas >= 2.0 raises on DataFrame.corr() when object columns
# are present (needs numeric_only=True) — confirm the pandas version in use.
dfFull.corr().TitleCat
Out[74]:
In [76]:
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
# Reassign the "rare title" passengers (TitleCat == 0) to one of the common
# titles, predicted from sex and the family-size categories.
dtree = DecisionTreeClassifier()
X_train = dfFull[dfFull.TitleCat!=0][['Sex','ParchCat','SibSpCat']]
y = dfFull[dfFull.TitleCat!=0]['TitleCat']
X_test = dfFull[dfFull.TitleCat==0][['Sex','ParchCat','SibSpCat']]
dtree.fit(X_train,y)
prediction = dtree.predict(X_test)
dfPrediction = pd.DataFrame(data=prediction,index = X_test.index.values,columns=['TitleCat'])
#print(dfPrediction)
#print(dfPrediction)
In [77]:
# Merge the predicted titles back into dfFull; update() aligns on the index
# and overwrites TitleCat only for the rows present in dfPrediction.
# (Removed a dead bare expression that displayed nothing mid-cell.)
dfFull.update(dfPrediction)
In [78]:
# Inspect the reassigned rows: predicted TitleCat next to the original title.
dfFull.loc[dfPrediction.index,['TitleCat','Title','Sex','SibSpCat','ParchCat']]
Out[78]:
In [79]:
# Re-slice so the updated TitleCat values are included, then model on them.
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [80]:
# Round ages to whole years (simplifies the category-style prediction below).
dfFull['Age'] = dfFull.Age.round()
In [81]:
dfFull.corr().Age # Age has many missing values, so use the correlated columns to impute it from the data we do have
Out[81]:
In [82]:
# Train on rows with a known age; predict the rows where Age is missing.
X_train = dfFull[dfFull.Age.notnull()][['Pclass','SibSp','CabinCat','TitleCat']]
X_test = dfFull[dfFull.Age.isnull()][['Pclass','SibSp','CabinCat','TitleCat']]
y = dfFull.Age.dropna() # same rows (and order) as X_train
In [84]:
dtree = DecisionTreeClassifier()
# NOTE(review): a classifier treats each rounded age as a discrete class;
# a regressor would be the more natural choice — confirm intent.
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
agePrediction = pd.DataFrame(data=prediction,index=X_test.index.values,columns=['Age'])
dfFull = dfFull.combine_first(agePrediction) # combine_first merges the two frames, filling dfFull's null ages from agePrediction
In [85]:
# Sanity check: no missing ages should remain.
dfFull.Age.isnull().sum()
Out[85]:
In [86]:
# Bin ages into 5-year brackets with pd.cut — one call creates the new column.
# Upper bound extended to 85: the original range(0, 80, 5) stopped at 75, so
# passengers older than 75 (the max age is 80) fell into a NaN bin.
dfFull['ageBins'] = pd.cut(dfFull['Age'], list(range(0, 85, 5)))
In [88]:
# Survival by age bracket and sex, on the train rows only (ids 1-891).
# NOTE(review): factorplot/size are deprecated in newer seaborn (catplot/height).
sns.factorplot("ageBins", "Survived", hue="Sex", data=dfFull.loc[1:891,:],size=10)
Out[88]:
In [89]:
# Counts per (sex, age bracket) to judge how reliable each bar above is.
dfFull.loc[1:891,:].groupby(['Sex','ageBins']).Name.count()
Out[89]:
In [90]:
# Binarize age: children under 11 -> 0, everyone else -> 1.
dfFull.loc[dfFull.Age <11,'Age'] = 0
# Safe despite the apparent overlap with >= 10: rows set to 0 above no longer
# satisfy Age >= 10, so only the remaining (>= 11) ages become 1.
dfFull.loc[(dfFull.Age >=10),'Age'] = 1
dfFull.Age.value_counts()
Out[90]:
In [91]:
# Re-slice so the binarized Age is included, then model on it.
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [92]:
# How many passengers share each ticket number.
dfTicket = dfFull.Ticket.value_counts()
dfTicket.head()
Out[92]:
In [93]:
lstTicket = dfTicket.loc[dfTicket > 1].index.tolist() # tickets shared by several passengers (groups/families)
lstTicketSingle = dfTicket.loc[dfTicket == 1].index.tolist() # tickets held by a single passenger
In [94]:
# Number of shared (group) tickets.
len(lstTicket)
Out[94]:
In [95]:
# Number of single-passenger tickets.
len(lstTicketSingle)
Out[95]:
In [96]:
# Spot-check one shared ticket — presumably all one family; verify the names.
dfFull[dfFull.Ticket=='347082'].Name
Out[96]:
In [97]:
# Start TicketCat as a copy of the raw ticket strings; it gets overwritten
# with numeric group ids in the next cell.
dfFull['TicketCat'] = dfFull['Ticket'].copy()
In [98]:
# Shared (group/family) tickets get distinct ids 1, 2, ...; single tickets get 0.
# A single vectorized map replaces the original per-ticket loop, which rescanned
# the whole frame once per ticket (O(n * tickets)) — same resulting values.
ticket_to_cat = {ticket: i for i, ticket in enumerate(lstTicket, start=1)}
ticket_to_cat.update({ticket: 0 for ticket in lstTicketSingle})
dfFull['TicketCat'] = dfFull['Ticket'].map(ticket_to_cat)
In [99]:
# Re-slice so TicketCat is included, then model on the full feature set.
dfTrain = dfFull.loc[1:891,:]
dfTest = dfFull.loc[892:,:]
dtree = DecisionTreeClassifier()
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age','TicketCat']]
y = dfTrain['Survived']
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age','TicketCat']]
dtree.fit(X_train, y)
prediction = dtree.predict(X_test)
# Use dfTest.index (named 'PassengerId') instead of .values so the submission
# CSV keeps the 'PassengerId' header Kaggle requires.
dfPrediction = pd.DataFrame(data=prediction, index=dfTest.index, columns=['Survived'])
contentTestPredObject1 = dfPrediction.to_csv()
#print(contentTestPredObject1)
In [100]:
from sklearn.model_selection import train_test_split
X_train = dfTrain[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age','TicketCat']]
X_test = dfTest[['Pclass','Sex','ParchCat','Embarked','Fare','TitleCat','Age','TicketCat']]
y = dfTrain['Survived']
# Hold out a third of the labelled data so competing models can be compared.
X_NewTrain, X_NewTest,y_NewTrain, y_NewTest = train_test_split(X_train, y,
test_size=0.33, # 1/3 test; 2/3 train
random_state=1410) # fixed seed so every run compares models on the same split
In [101]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier

# Fit a spread of off-the-shelf classifiers on the held-out split and print
# each model's hold-out accuracy next to its class name.
classifiers = [
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LogisticRegression(),
    LinearSVC(),
]
for model in classifiers:
    name = model.__class__.__name__
    model.fit(X_NewTrain, y_NewTrain)
    prediction = model.predict(X_NewTest)
    # Side-by-side table of predicted vs. actual labels for this model
    # (np.column_stack pairs the two arrays like zip).
    rank = pd.DataFrame(data=np.column_stack([prediction, y_NewTest]),
                        index=X_NewTest.index.values,
                        columns=['Predicted', 'Real'])
    # Fraction of hold-out rows predicted correctly.
    accurracy = np.sum(rank.Predicted.values == rank.Real.values) / len(y_NewTest)
    print(accurracy, name)
In [ ]: