Import libraries.
In [191]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
%matplotlib inline
Load training data and test data.
In [192]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
Well, I don't really have any idea how to handle these data yet, so let's just take a look at them, starting with the training data.
In [193]:
df_train.head()
Out[193]:
In [194]:
df_train.describe()
Out[194]:
In [195]:
df_train.info()
In [196]:
df_train.isnull().sum()
Out[196]:
Hmm... Some data are missing. Age could be an important feature. Cabin seems like a useless feature and I am going to discard it. Well, my first question: how do you decide which features to use and which to drop?
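One quick heuristic (just a sketch, not a full answer to my own question) is to check how much of each column is missing, and how the numeric columns correlate with Survived:
In [ ]:
# fraction of missing values per column; a column that is mostly NaN
# (like Cabin here) is a weak candidate for a first model
missing_ratio = df_train.isnull().mean().sort_values(ascending=False)
print(missing_ratio)
# rough signal check: correlation of the numeric columns with the target
print(df_train.select_dtypes('number').corr()['Survived'].sort_values())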
After I read other people's analyses, they showed me this:
In [197]:
df_train.describe(include=['O'])
Out[197]:
Hmm... It seems some people share one cabin. Could it be that people in the same cabin helped each other and increased their chance of survival? But Cabin has too little data. Also, a ticket number is shared by up to 7 people, which suggests a group traveling together, and a group would be more likely to help each other and improve their survival chances.
Among the 891 rows, 577 are male and 314 female.
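As a quick check of the shared-ticket idea (exploratory only; nothing below uses this), one can count how many passengers share each ticket and look at survival by group size:
In [ ]:
# size of the group sharing each ticket number
ticket_counts = df_train['Ticket'].value_counts()
print(ticket_counts.head())
# survival rate by group size, without adding a column to df_train
group_size = df_train['Ticket'].map(ticket_counts).rename('GroupSize')
print(pd.concat([group_size, df_train['Survived']], axis=1).groupby('GroupSize').mean())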
Now, do the same for the test data.
In [198]:
df_test.head()
Out[198]:
In [199]:
df_test.describe()
Out[199]:
In [200]:
df_test.describe(include=['O'])
Out[200]:
In [201]:
sns.countplot(x='Survived', data=df_train)
plt.show()
In [202]:
df_train['Percentage'] = 1 # this is a helper column
df_train[['Percentage','Survived']].groupby('Survived').count().apply(lambda x: (100 * x)/x.sum())
Out[202]:
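The helper column works; for the record, value_counts with normalize=True gives the same percentages in one line:
In [ ]:
# same survival percentages without a helper column
print(df_train['Survived'].value_counts(normalize=True) * 100)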
In [203]:
df_train[['Pclass','Survived']].groupby('Pclass').mean()
Out[203]:
In [204]:
df_train['Count'] = 1 # this is a helper column
df_train[['Pclass','Survived','Count']].groupby(['Pclass','Survived']).count()
Out[204]:
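pd.crosstab produces the same count table more directly, if you prefer:
In [ ]:
# counts of survivors and non-survivors per class
print(pd.crosstab(df_train['Pclass'], df_train['Survived']))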
In [205]:
df_train[['Sex','Survived']].groupby('Sex').mean()
Out[205]:
In [206]:
df_train[['Sex','Survived','Count']].groupby(['Sex','Survived']).count()
Out[206]:
In [207]:
df_train[['Pclass','Sex','Survived','Count']].groupby(['Pclass','Sex','Survived']).count()
Out[207]:
In [208]:
df_train[['Pclass','Sex','Survived']].groupby(['Pclass','Sex']).mean()
Out[208]:
The female survival rates in Pclass 1 and 2 are similar, but Pclass 3 is much lower. The story goes that the gates from the Pclass 3 quarters up to the deck were locked early on. That's sad...
The male survival rates in Pclass 2 and 3 are similar, but Pclass 1 is much higher.
In [209]:
sns.boxplot(x='Survived', y='Age', hue='Sex',data=df_train, palette="coolwarm")
plt.show()
In [210]:
def SimplyAge(colage):
    # bin ages into groups; missing ages fall into the (-2, 0] bucket
    colage = colage.fillna(-1)
    bins = (-2, 0, 5, 10, 20, 35, 60, 100)
    colage = pd.cut(colage, bins)
    return colage
colage = SimplyAge(df_train['Age'])
df_train['Age'] = colage
dfage = df_train  # alias used by the count/bar plots below
In [211]:
df_train[['Age','Survived','Count']].groupby(['Age','Survived']).count()
Out[211]:
In [212]:
df_train[['Age','Survived']].groupby('Age').mean()
Out[212]:
Well, the babies seem to have the highest survival rate.
In [213]:
df_train[['Age','Survived','Sex','Count']].groupby(['Age','Sex','Survived']).count()
Out[213]:
In [214]:
df_train[['Age','Sex','Survived']].groupby(['Age','Sex']).mean()
Out[214]:
In [215]:
#sns.countplot(x='Age',hue='Survived',data=dfage)
sns.countplot(x='Age',data=dfage,color='Red')
sns.barplot(x='Age',y='Survived',data=dfage,estimator=np.sum,color='Blue')
Out[215]:
In [216]:
sns.barplot(x='Age',y='Count',hue='Survived',data=dfage,estimator=np.sum)
Out[216]:
In [217]:
sns.barplot(x='Pclass', y='Survived', hue='Sex',data=df_train,estimator=np.sum)
plt.show()
I am going to remove the other columns that contain NAs, just for a quick and dirty test.
In [218]:
del df_train['Cabin']
del df_train['Embarked']
del df_train['Ticket']
del df_train['Name']
del df_train['PassengerId']
# remove the helper columns
del df_train['Count']
del df_train['Percentage']
df_train.columns
Out[218]:
In [219]:
df_train['Sex'] = df_train['Sex'].map({'female':1,'male':0}).astype(int)
In [220]:
# convert the interval labels to small integers; going through str keeps the
# mapping keys valid whether pandas stores the categories as strings or Intervals
df_train['Age'] = df_train['Age'].astype(str).map({'(-2, 0]':-1,'(0, 5]':1,'(5, 10]':2,'(10, 20]':3,'(20, 35]':4,'(35, 60]':5,'(60, 100]':6}).astype(int)
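The string keys above depend on how pandas renders the intervals. A less fragile alternative (a sketch; the rest of this notebook keeps the mapping above) is to hand integer labels to pd.cut directly:
In [ ]:
# re-read the raw Age column just to demonstrate; same bins as SimplyAge
raw_age = pd.read_csv('../input/train.csv')['Age'].fillna(-1)
bins = (-2, 0, 5, 10, 20, 35, 60, 100)
labels = (-1, 1, 2, 3, 4, 5, 6)
age_codes = pd.cut(raw_age, bins, labels=labels).astype(int)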
In [221]:
df_train.head()
Out[221]:
In [222]:
x_train = df_train.drop("Survived",axis=1)
y_train = df_train['Survived']
In [223]:
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
logreg.score(x_train, y_train)
Out[223]:
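Note that this score is accuracy on the training data itself. A rough out-of-sample estimate (a quick sketch using the same x_train and y_train) is k-fold cross-validation:
In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy, a more honest estimate than training accuracy
scores = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
print(scores.mean(), scores.std())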
Now I am going to run the prediction on the test data.
In [224]:
del df_test['Cabin']
del df_test['Embarked']
del df_test['Ticket']
del df_test['Name']
test_id = df_test['PassengerId']
del df_test['PassengerId']
# Sex
df_test['Sex'] = df_test['Sex'].map({'female':1,'male':0}).astype(int)
# Age
colage = SimplyAge(df_test['Age'])
df_test['Age'] = colage
df_test['Age'] = df_test['Age'].astype(str).map({'(-2, 0]':-1,'(0, 5]':1,'(5, 10]':2,'(10, 20]':3,'(20, 35]':4,'(35, 60]':5,'(60, 100]':6}).astype(int)
df_test.head()
Out[224]:
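Since the test set gets exactly the same Sex and Age transformations as the training set, they could be factored into one helper to avoid the duplication (a sketch; preprocess and AGE_MAP are just illustrative names):
In [ ]:
AGE_MAP = {'(-2, 0]': -1, '(0, 5]': 1, '(5, 10]': 2, '(10, 20]': 3,
           '(20, 35]': 4, '(35, 60]': 5, '(60, 100]': 6}

def preprocess(df):
    # apply the same Sex/Age encoding used above to any frame with raw columns
    df = df.copy()
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
    df['Age'] = SimplyAge(df['Age']).astype(str).map(AGE_MAP).astype(int)
    return df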
In [225]:
df_test.info()
In [226]:
df_test[df_test['Fare'].isnull()]
Out[226]:
Well, the passenger with the NaN Fare is in Pclass 3, so I am going to fill it with the mean Fare of Pclass 3.
In [227]:
df_test[df_test['Pclass']==3]['Fare'].mean()
Out[227]:
In [228]:
df_test['Fare'] = df_test['Fare'].fillna(df_test[df_test['Pclass']==3]['Fare'].mean())
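Equivalently, a pattern that would handle missing fares in any class (a sketch; with a single NaN it does the same thing as the line above):
In [ ]:
# fill each missing Fare with the mean Fare of that passenger's Pclass
df_test['Fare'] = df_test['Fare'].fillna(df_test.groupby('Pclass')['Fare'].transform('mean'))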
In [229]:
Y_pred = logreg.predict(df_test)
In [232]:
ypred = pd.DataFrame()
ypred['PassengerId'] = test_id
ypred['Survived'] = Y_pred
ypred
Out[232]:
In [236]:
ypred.to_csv("1st_pred.csv",index=False)