In [3619]:
%run /Users/excalibur/py/scripts/data_wrangling.py functions
%matplotlib inline
In [3620]:
%ls data
In [3621]:
train_df = load_data('data/train_new.csv')
In [3622]:
#explore(train_df)
In [3623]:
submission_df = load_data('data/test_new.csv')
In [3624]:
#explore(submission_df)
In [3625]:
#train_df.plot()
In [3626]:
#train_df.plot(kind='box')
In [3627]:
#train_df['Age'].plot()
In [3628]:
#train_df['Fare'].plot(kind='hist', alpha=0.7, bins=20)
In [3629]:
#train_df.hist(figsize=(10, 8))
#plt.show()
In [3630]:
#sns.pairplot(train_df[['Age', 'Embarked', 'Fare', 'Survived']], hue="Survived")
#plt.show()
In [3631]:
#train_df.loc[train_df['Survived'] == 0, 'Sex'].value_counts()
In [3632]:
#sns.catplot(x='Survived', hue='Sex', kind='count', data=train_df)  # factorplot was renamed catplot in newer seaborn
#plt.show()
In [3633]:
#train_df.loc[train_df['Survived'] == 0, 'Pclass'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Pclass'].hist(color='green', alpha=0.7)
#plt.show()
In [3634]:
#train_df.loc[train_df['Survived'] == 0, 'Sex'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Sex'].hist(color='green', alpha=0.7)
#plt.show()
In [3635]:
#train_df.loc[train_df['Survived'] == 0, 'Age'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Age'].hist(color='green', alpha=0.7)
#plt.show()
In [3636]:
#train_df.loc[train_df['Survived'] == 0, 'SibSp'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'SibSp'].hist(color='green', alpha=0.7)
#plt.show()
In [3637]:
#train_df.loc[train_df['Survived'] == 0, 'Parch'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Parch'].hist(color='green', alpha=0.7)
#plt.show()
In [3638]:
#train_df.loc[train_df['Survived'] == 0, 'Fare'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Fare'].hist(color='green', alpha=0.7)
#plt.show()
In [3639]:
#train_df.loc[train_df['Survived'] == 0, 'Embarked'].hist(color='black', bins=7)
#train_df.loc[train_df['Survived'] == 1, 'Embarked'].hist(color='green', alpha=0.7)
#plt.show()
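The seven histogram cells above repeat one pattern per column; a loop sketch (assuming the columns are already numeric, per the numericalizing note further down) is easier to extend:
cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
fig, axes = plt.subplots(len(cols), 1, figsize=(6, 3 * len(cols)))
for ax, col in zip(axes, cols):
    # black = perished, green = survived, same palette as the cells above
    train_df.loc[train_df['Survived'] == 0, col].hist(ax=ax, color='black', bins=7)
    train_df.loc[train_df['Survived'] == 1, col].hist(ax=ax, color='green', alpha=0.7)
    ax.set_title(col)
plt.tight_layout()
plt.show()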
In [3640]:
#plt.scatter(train_df.loc[train_df['Survived'] == 0, 'Age'], train_df.loc[train_df['Survived'] == 0, 'Fare'], alpha=0.5, color='orange')
#plt.scatter(train_df.loc[train_df['Survived'] == 1, 'Age'], train_df.loc[train_df['Survived'] == 1, 'Fare'], alpha=0.3, color='blue')
#plt.show()
In [3641]:
# based on the histograms above, Age, SibSp, Parch, and Embarked separate survivors from non-survivors least, so drop them
#cols_to_drop = ['Age', 'SibSp', 'Parch', 'Embarked']
#train_df = train_df.drop(cols_to_drop, axis=1)
#submission_df = submission_df.drop(cols_to_drop, axis=1)
In [3642]:
# interaction feature: ticket class scaled by the fare paid
train_df['ClassFare'] = train_df['Pclass'] * train_df['Fare']
submission_df['ClassFare'] = submission_df['Pclass'] * submission_df['Fare']
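A quick sanity check on whether the interaction adds signal beyond its parts (a sketch; it assumes 'Survived' is still a column of train_df, which holds until the next cell):
# correlation of each fare-related column with the label
train_df[['Pclass', 'Fare', 'ClassFare']].corrwith(train_df['Survived'])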
In [3643]:
train_y = train_df['Survived']
train_df = train_df.drop('Survived', axis=1)
In [3644]:
train_X, test_X, train_y, test_y = train_test_split(train_df, train_y, test_size=0.20, random_state=1)
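Since far more passengers perished than survived, a stratified split is worth trying as a drop-in replacement for the cell above (not in addition to it, since train_y is rebound):
# keep the survived/perished ratio identical in both halves of the split
train_X, test_X, train_y, test_y = train_test_split(
    train_df, train_y, test_size=0.20, random_state=1, stratify=train_y)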
In [3645]:
clf = RandomForestClassifier(n_estimators=300)  # tried 10, 30, 100, 300
clf = clf.fit(train_X, train_y)
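With the forest fit, its per-column importances hint at which features the drop decision above got right (a sketch; assumes train_X is still a DataFrame and so carries column names):
# higher value = the forest split on this column more productively
sorted(zip(train_X.columns, clf.feature_importances_), key=lambda t: -t[1])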
In [3646]:
clf.score(test_X, test_y)
# BEST LOCAL SCORE AFTER SWITCHING TO train_test_split AND NUMERICALIZING EVERYTHING: 0.86033519553072624
# WHEN LOCAL WAS 0.86033519553072624, KAGGLE WAS 0.77990
# BEST KAGGLE: 0.78469, WHEN LOCAL WAS 0.85474860335195535
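The local/Kaggle gaps recorded above suggest the single 80/20 split is optimistic; k-fold cross-validation gives a steadier local estimate. A sketch, assuming a scikit-learn new enough to ship sklearn.model_selection:
from sklearn.model_selection import cross_val_score
# mean and spread of accuracy over 5 folds is less split-dependent than a single score
scores = cross_val_score(RandomForestClassifier(n_estimators=300), train_X, train_y, cv=5)
print(scores.mean(), scores.std())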
In [3658]:
clf = linear_model.LogisticRegression()
clf = clf.fit(train_X, train_y)
In [3659]:
clf.score(test_X, test_y)
# BEST KAGGLE: 0.76555, WHEN LOCAL WAS: 0.81005586592178769
In [3649]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf = clf.fit(train_X, train_y)
clf.score(test_X, test_y)
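All three classifiers can be compared on the identical split in one cell; a sketch assuming the imports already in scope:
# fit and score each model on the same train/test split
for model in (RandomForestClassifier(n_estimators=300),
              linear_model.LogisticRegression(),
              GaussianNB()):
    print(type(model).__name__, model.fit(train_X, train_y).score(test_X, test_y))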
In [3660]:
# load this only as a template for the submission dataframe (it carries the required PassengerId column)
submission_y_df = load_data('data/genderclassmodel.csv')
In [3661]:
submission_y_df['Survived'] = clf.predict(submission_df)
In [3662]:
submission_y_df.to_csv('data/predictions.csv', index=False)
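Loading genderclassmodel.csv just to borrow its PassengerId column is indirect; the frame can also be built from the raw test file. A sketch, where 'data/test.csv' is an assumed path for the original Kaggle file and pd is assumed to come in via the %run script:
raw_test = pd.read_csv('data/test.csv')  # assumed path, not verified above
pred_df = pd.DataFrame({'PassengerId': raw_test['PassengerId'],
                        'Survived': clf.predict(submission_df)})
pred_df.to_csv('data/predictions.csv', index=False)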
In [3653]:
#test_pred = clf.predict(submission_df) # get predictions
#clf.score(submission_df, submission_y['Survived'])
### THESE LOCAL SCORES MEAN NOTHING: THEY COMPARE PREDICTIONS AGAINST THE GENDERCLASSMODEL BASELINE, NOT TRUE LABELS
#BEST LOCAL (when using all test data locally): 0.88995215311004783
#BEST LOCAL (when using all test data locally): 1.0, which is weird, so...
#BEST KAGGLE (when using all test data locally): 0.76555
In [3654]:
#submission_y = load_data('data/gendermodel.csv')
#submission_y = load_data('data/genderclassmodel.csv') # loaded only as a template for the submission dataframe
# GENDERCLASSMODEL scores 0.77990 on KAGGLE
In [3655]:
#indices = np.random.choice(submission_y.shape[0], size=submission_y.shape[0] // 2, replace=False)  # // so size is an integer under Python 3
#test_pred = clf.predict(submission_df.loc[indices, :])
#clf.score(submission_df.loc[indices, :], submission_y.loc[indices, 'Survived'])
### THESE LOCAL SCORES MEAN NOTHING: THEY COMPARE PREDICTIONS AGAINST THE GENDERCLASSMODEL BASELINE, NOT TRUE LABELS
#BEST LOCAL (when using 50% of test data locally): 0.91866028708133973
#BEST LOCAL (when using 50% of test data locally): 1.0, which is weird, so...
#BEST KAGGLE (when using 50% of test data locally): 0.77990
In [3656]:
#pred_df = submission_y
#pred_df['Survived'] = clf.predict(submission_df)
#pred_df['Survived'] = test_pred
#pred_df.head(1)
#BEST KAGGLE: 0.77990
In [3657]:
#pred_df.to_csv('data/predictions.csv', index=False)