In [3619]:
%run /Users/excalibur/py/scripts/data_wrangling.py functions
%matplotlib inline
In [3620]:
%ls data
In [3621]:
train_df = load_data('data/train_new.csv')
In [3622]:
#explore(train_df)
In [3623]:
submission_df = load_data('data/test_new.csv')
In [3624]:
#explore(submission_df)
In [3625]:
#train_df.plot()
In [3626]:
#train_df.plot(kind='box')
In [3627]:
#train_df['Age'].plot()
In [3628]:
#train_df['Fare'].plot(kind='hist', alpha=0.7, bins=20)
In [3629]:
#train_df.hist(figsize=(10, 8))
#plt.show()
In [3630]:
#sns.pairplot(train_df[['Age', 'Embarked', 'Fare', 'Survived']], hue="Survived")
#plt.show()
In [3631]:
#train_df.loc[train_df['Survived'] == 0, 'Sex'].value_counts()
In [3632]:
#sns.catplot(x='Survived', hue='Sex', kind='count', data=train_df)  # factorplot was renamed catplot in newer seaborn
#plt.show()
In [3633]:
#train_df.loc[train_df['Survived'] == 0, 'Pclass'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Pclass'].hist(color='green', alpha=0.7)
#plt.show()
In [3634]:
#train_df.loc[train_df['Survived'] == 0, 'Sex'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Sex'].hist(color='green', alpha=0.7)
#plt.show()
In [3635]:
#train_df.loc[train_df['Survived'] == 0, 'Age'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Age'].hist(color='green', alpha=0.7)
#plt.show()
In [3636]:
#train_df.loc[train_df['Survived'] == 0, 'SibSp'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'SibSp'].hist(color='green', alpha=0.7)
#plt.show()
In [3637]:
#train_df.loc[train_df['Survived'] == 0, 'Parch'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Parch'].hist(color='green', alpha=0.7)
#plt.show()
In [3638]:
#train_df.loc[train_df['Survived'] == 0, 'Fare'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Fare'].hist(color='green', alpha=0.7)
#plt.show()
In [3639]:
#train_df.loc[train_df['Survived'] == 0, 'Embarked'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Embarked'].hist(color='green', alpha=0.7)
#plt.show()
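The seven histogram cells above repeat one pattern per column; a loop sketch (assuming the columns are already numeric, per the numericalizing note further down) is easier to extend:
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
fig, axes = plt.subplots(len(cols), 1, figsize=(6, 3 * len(cols)))
for ax, col in zip(axes, cols):
    # black = perished, green = survived, same palette as the cells above
    train_df.loc[train_df['Survived'] == 0, col].hist(ax=ax, color='black', bins=7)
    train_df.loc[train_df['Survived'] == 1, col].hist(ax=ax, color='green', alpha=0.7)
    ax.set_title(col)
plt.tight_layout()
plt.show()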
In [3640]:
#plt.scatter(train_df.loc[train_df['Survived'] == 0, 'Age'], train_df.loc[train_df['Survived'] == 0, 'Fare'], alpha=0.5, color='orange')
#plt.scatter(train_df.loc[train_df['Survived'] == 1, 'Age'], train_df.loc[train_df['Survived'] == 1, 'Fare'], alpha=0.3, color='blue')
#plt.show()
In [3641]:
# based on the histograms above, Age, SibSp, Parch, and Embarked separate survivors from non-survivors least, so drop them
#cols_to_drop = ['Age', 'SibSp', 'Parch', 'Embarked']
#train_df = train_df.drop(cols_to_drop, axis=1)
#submission_df = submission_df.drop(cols_to_drop, axis=1)
In [3642]:
# interaction feature: ticket class scaled by the fare paid
train_df['ClassFare'] = train_df['Pclass'] * train_df['Fare']
submission_df['ClassFare'] = submission_df['Pclass'] * submission_df['Fare']
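A quick sanity check on whether the interaction adds signal beyond its parts (a sketch; it assumes 'Survived' is still a column of train_df, which holds until the next cell):
# correlation of each fare-related column with the label
train_df[['Pclass', 'Fare', 'ClassFare']].corrwith(train_df['Survived'])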
In [3643]:
train_y = train_df['Survived']
train_df = train_df.drop('Survived', axis=1)
In [3644]:
train_X, test_X, train_y, test_y = train_test_split(train_df, train_y, test_size=0.20, random_state=1)
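Since far more passengers perished than survived, a stratified split is worth trying as a drop-in replacement for the cell above (not in addition to it, since train_y is rebound):
# keep the survived/perished ratio identical in both halves of the split
train_X, test_X, train_y, test_y = train_test_split(
    train_df, train_y, test_size=0.20, random_state=1, stratify=train_y)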
In [3645]:
clf = RandomForestClassifier(n_estimators=300)  # tried 10, 30, 100, 300
clf = clf.fit(train_X, train_y)
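With the forest fit, its per-column importances hint at which features the drop decision above got right (a sketch; assumes train_X is still a DataFrame and so carries column names):
# higher value = the forest split on this column more productively
sorted(zip(train_X.columns, clf.feature_importances_), key=lambda t: -t[1])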
In [3646]:
clf.score(test_X, test_y)
# BEST LOCAL SCORE AFTER SWITCHING TO train_test_split AND NUMERICALIZING EVERYTHING: 0.86033519553072624
# WHEN LOCAL WAS 0.86033519553072624, KAGGLE WAS 0.77990
# BEST KAGGLE: 0.78469, WHEN LOCAL WAS 0.85474860335195535
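The local/Kaggle gaps recorded above suggest the single 80/20 split is optimistic; k-fold cross-validation gives a steadier local estimate. A sketch, assuming a scikit-learn new enough to ship sklearn.model_selection:
from sklearn.model_selection import cross_val_score
# mean and spread of accuracy over 5 folds is less split-dependent than a single score
scores = cross_val_score(RandomForestClassifier(n_estimators=300), train_X, train_y, cv=5)
print(scores.mean(), scores.std())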
In [3658]:
clf = linear_model.LogisticRegression()
clf = clf.fit(train_X, train_y)
In [3659]:
clf.score(test_X, test_y)
# BEST KAGGLE: 0.76555, WHEN LOCAL WAS: 0.81005586592178769
In [3649]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf = clf.fit(train_X, train_y)
clf.score(test_X, test_y)
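All three classifiers can be compared on the identical split in one cell; a sketch assuming the imports already in scope:
# fit and score each model on the same train/test split
for model in (RandomForestClassifier(n_estimators=300),
              linear_model.LogisticRegression(),
              GaussianNB()):
    print(type(model).__name__, model.fit(train_X, train_y).score(test_X, test_y))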
In [3660]:
# load this only as a template for the submission dataframe (it carries the required PassengerId column)
submission_y_df = load_data('data/genderclassmodel.csv')
In [3661]:
submission_y_df['Survived'] = clf.predict(submission_df)
In [3662]:
submission_y_df.to_csv('data/predictions.csv', index=False)
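Loading genderclassmodel.csv just to borrow its PassengerId column is indirect; the frame can also be built from the raw test file. A sketch, where 'data/test.csv' is an assumed path for the original Kaggle file and pd is assumed to come in via the %run script:
raw_test = pd.read_csv('data/test.csv')  # assumed path, not verified above
pred_df = pd.DataFrame({'PassengerId': raw_test['PassengerId'],
                        'Survived': clf.predict(submission_df)})
pred_df.to_csv('data/predictions.csv', index=False)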
In [3653]:
#test_pred = clf.predict(submission_df) # get predictions
#clf.score(submission_df, submission_y['Survived'])
### THESE LOCAL SCORES MEAN NOTHING: THEY COMPARE PREDICTIONS AGAINST THE GENDERCLASSMODEL BASELINE, NOT TRUE LABELS
#BEST LOCAL (when using all test data locally): 0.88995215311004783
#BEST LOCAL (when using all test data locally): 1.0, which is weird, so...
#BEST KAGGLE (when using all test data locally): 0.76555
In [3654]:
#submission_y = load_data('data/gendermodel.csv')
#submission_y = load_data('data/genderclassmodel.csv') # loaded only as a template for the submission dataframe
# GENDERCLASSMODEL scores 0.77990 on KAGGLE
In [3655]:
#indices = np.random.choice(submission_y.shape[0], size=submission_y.shape[0] // 2, replace=False)  # // so size is an integer under Python 3
#test_pred = clf.predict(submission_df.loc[indices, :])
#clf.score(submission_df.loc[indices, :], submission_y.loc[indices, 'Survived'])
### THESE LOCAL SCORES MEAN NOTHING: THEY COMPARE PREDICTIONS AGAINST THE GENDERCLASSMODEL BASELINE, NOT TRUE LABELS
#BEST LOCAL (when using 50% of test data locally): 0.91866028708133973
#BEST LOCAL (when using 50% of test data locally): 1.0, which is weird, so...
#BEST KAGGLE (when using 50% of test data locally): 0.77990
In [3656]:
#pred_df = submission_y
#pred_df['Survived'] = clf.predict(submission_df)
#pred_df['Survived'] = test_pred
#pred_df.head(1)
#BEST KAGGLE: 0.77990
In [3657]:
#pred_df.to_csv('data/predictions.csv', index=False)