Lecture notes on logistic regression: http://george1328.github.io/lecture_notes/Logistic_Regression.pdf
In [56]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
%pylab inline
In [57]:
df = pd.read_csv('train.csv', header = 0)
In [58]:
df.describe()
Out[58]:
In [59]:
df.info()
There are missing values in Age, Cabin and Embarked
In [60]:
df.Embarked.describe()
Out[60]:
In [61]:
# seaborn is cool. Here we can the equivalent of a Pandas group by AND plot in one step
sns.boxplot(df.Age, df.Sex)
Out[61]:
In [62]:
# loof at the distribution of ages
sns.kdeplot(df.Age, shade=True)
Out[62]:
In [63]:
sns.boxplot(df.Age, df.Survived)
Out[63]:
In [64]:
# Lets populate 3 missing Embarked values with the mode value
from scipy.stats import mode
mode=mode(df.Embarked)[0][0]
print mode
df.Embarked = df.Embarked.fillna(mode)
In [65]:
#Lets use a temp df to drop missing values. First drop all Cabin NAs, then drop the rest
temp = df.drop(['Cabin'], axis =1)
temp = temp.dropna()
In [66]:
# Here's a way to view data using the Pandas groupby function
grouped = temp.groupby('Pclass')
grouped.get_group(1).Age.hist()
grouped.get_group(2).Age.hist()
grouped.get_group(3).Age.hist()
Out[66]:
In [67]:
#Here we use matplotlip. Notice that this graph tells us that age distribution is different for each class
plt.figure(figsize(5,3))
temp.Age[temp.Pclass == 1].plot(kind='kde')
temp.Age[temp.Pclass == 2].plot(kind='kde')
temp.Age[temp.Pclass == 3].plot(kind='kde')
plt.xlabel("Age")
plt.title("Age by class")
plt.legend(('1st', '2nd','3rd'))
Out[67]:
In [68]:
#Lets get the mean and median ages
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
print meanAges
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
print medianAges
In [69]:
#Lets plugin median ages for the missing values
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(meanAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)
In [70]:
(df.Fare==0).sum()
Out[70]:
In [71]:
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
In [72]:
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
In [73]:
#Now our data is fixed and ready
df.describe()
Out[73]:
In [74]:
#some data re-engineering
df['familySize'] = df.SibSp + df.Parch +1
In [75]:
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
In [76]:
port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
In [77]:
#Lets view the data
fig = plt.figure(figsize=(18,9), dpi=1600)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace =.5, wspace=.15)
alpha=alpha_scatterplot = 0.2
alpha_bar_chart = 0.55
ax1 = plt.subplot2grid((3,3),(0,0))
df.Survived.value_counts().plot(kind='bar', alpha=alpha_bar_chart)
ax1.set_xlim(-1, 2)
ax1.set_xticklabels(["Died", "Survived"], rotation=0)
plt.title("Distribution of Survival")
plt.subplot2grid((3,3),(0,2))
plt.scatter(df.Survived, df.Age, alpha=alpha_scatterplot)
plt.ylabel("Age")
plt.title("Survial by Age")
ax3 = plt.subplot2grid((3,3),(0,1))
df.Pclass.value_counts().plot(kind="bar", alpha=alpha_bar_chart)
ax3.set_xticklabels(["3rd Class", "1st Class", "2nd Class"], rotation=0)
plt.title("Class Distribution")
plt.subplot2grid((3,3),(1,0))
plt.scatter(df.Survived, df.Gender, alpha=alpha_bar_chart)
plt.ylabel("Age")
plt.title("Survial by Gender")
ax4 = plt.subplot2grid((3,3),(1,0))
female1class = df.Survived[df.Sex == 'female'][df.Pclass == 1].value_counts()
female1class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female1class))
plt.title("female 1st class")
ax4 = plt.subplot2grid((3,3),(1,1))
female2class = df.Survived[df.Sex == 'female'][df.Pclass == 2].value_counts()
female2class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female2class))
plt.title("female 2nd class")
ax4 = plt.subplot2grid((3,3),(1,2))
female3class = df.Survived[df.Sex == 'female'][df.Pclass == 3].value_counts()
female3class.plot(kind='bar', color = '#d01265',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(female3class))
plt.title("female 3rd class")
ax4 = plt.subplot2grid((3,3),(2,0))
male1 = df.Survived[df.Sex == 'male'][df.Pclass == 1].value_counts()
male1.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male1))
plt.title("male 1st class")
ax4 = plt.subplot2grid((3,3),(2,1))
male2 = df.Survived[df.Sex == 'male'][df.Pclass == 2].value_counts()
male2.plot(kind='bar', color = '#2aa198',alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male2))
plt.title("male 2nd class")
ax4 = plt.subplot2grid((3,3),(2,2))
male3 = df.Survived[df.Sex == 'male'][df.Pclass == 3].value_counts()
male3.plot(kind='bar', color = '#2aa198', alpha=0.65)
ax4.set_xticklabels(["Died", "Survived"], rotation=0)
ax4.set_xlim(-1, len(male3))
plt.title("male 3rd class")
Out[77]:
In [78]:
features = df[['Gender', 'Pclass','Fare', 'familySize', 'Port' ]].values
target = df['Survived'].values
In [79]:
feature_names = ['Gender', 'Pclass','Fare', 'familySize', 'Port']
model_lr = LogisticRegression(C=1).fit(features, target)
x = np.arange(len(feature_names))
In [80]:
plt.bar(x, model_lr.coef_.ravel())
_ = plt.xticks(x + 0.5, feature_names, rotation=30)
This correlation matrix tells us which features are most influential
In [81]:
from sklearn.cross_validation import KFold
def cross_validate(X, y, classifier, k_fold) :
# derive a set of (random) training and testing indices
k_fold_indices = KFold(len(X), n_folds=k_fold,
shuffle=True, random_state=0)
k_score_total = 0
# for each training and testing slices run the classifier, and score the results
for train_slice, test_slice in k_fold_indices :
model = classifier(X[ train_slice ],
y[ train_slice ])
k_score = model.score(X[ test_slice ],
y[ test_slice ])
k_score_total += k_score
# return the average accuracy
return k_score_total/k_fold
In [82]:
model_lr = LogisticRegression(C=1).fit
cross_validate(features, target, model_lr, 11)
Out[82]:
In [83]:
model_lr = LogisticRegression(C=1).fit(features, target)
y = model_lr.predict(features)
In [84]:
from sklearn import metrics
metrics.precision_score(target, y)
Out[84]:
In [85]:
target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_80'] = proba[0].apply(lambda x:0 if x<.5 else 1)
metrics.precision_score(target, proba['class_0_at_80'])
Out[85]:
In [86]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
def plot_roc_curve(target_test, target_predicted_proba):
fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1
])
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--') # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
In [87]:
from sklearn.cross_validation import train_test_split
train_feat, test_feat, train_target, test_target = train_test_split(features, target, train_size=0.5)
In [88]:
model_lr = LogisticRegression(C=1).fit(train_feat, train_target)
target_predicted_proba = model_lr.predict_proba(test_feat)
In [89]:
plot_roc_curve(test_target, target_predicted_proba)
In [90]:
model_lr = LogisticRegression(C=1).fit(features, target)
In [91]:
df = pd.read_csv('test.csv', header = 0)
#mode=mode(df.Embarked)[0][0]
df.Embarked = df.Embarked.fillna('S')
meanAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'mean')
medianAges = df.pivot_table('Age', rows = 'Pclass', aggfunc = 'median')
df.Age = df[['Age', 'Pclass']].apply(lambda x: int(medianAges[x['Pclass']]) if pd.isnull(x['Age']) else x['Age'], axis =1)
medianFare = df.pivot_table('Fare', rows = 'Pclass', aggfunc = 'median')
df.Fare = df[['Fare', 'Pclass']].apply(lambda x: medianFare[x['Pclass']] if x['Fare'] ==0 else x['Fare'], axis=1)
df.Fare = df.Fare.fillna(13) # one fare value slipped through the cracks. must investigate
df['familySize'] = df.SibSp + df.Parch +1
df['Gender'] = df['Sex'].map({'female':0, 'male':1}).astype(int)
port_dict = {name: i for i, name in list(enumerate(np.unique(df['Embarked'])))}
df['Port'] = df['Embarked'].map(lambda x: port_dict[x])
features = df[['familySize', 'Fare', 'Pclass', 'Gender', 'Port']].values
In [92]:
y = model_lr.predict(features)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = y
result.to_csv('titanic_results', sep = ',', index = None)
Lets set our threshold to different values
In [93]:
target_predicted_proba = model_lr.predict_proba(features)
proba = pd.DataFrame(target_predicted_proba[:,1])
proba['class_0_at_58'] = proba[0].apply(lambda x:0 if x<.58 else 1)
result = pd.DataFrame(df['PassengerId'])
result['Survived'] = proba['class_0_at_58']
result.to_csv('titanic_results_with_threshold', sep = ',', index = None)