In [1195]:
# Execute the shared data-wrangling helper script inside this kernel so that its
# functions become available to the cells below. The run printed the list of helper
# signatures: load_data, explore, locate_null, swap_null, numericalize, drop_obj_cols
# (presumably in response to the 'functions' argument — confirm in the script).
# NOTE(review): the script path is absolute and machine-specific — confirm it
# resolves before running this notebook on another machine.
%run /Users/excalibur/py/scripts/data_wrangling.py functions
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


['load_data(filename)', 'explore(df)', 'locate_null(df, col_name)', 'swap_null(df, col_name, value)', 'numericalize(df, col_names)', 'drop_obj_cols(df)']

In [1196]:
# List the contents of the relative data/ directory to see which input and
# previously generated files are present.
%ls data


genderclassmodel.csv  myfirstforest.py      train.csv
genderclassmodel.py   predictions.csv       train_new.csv
gendermodel.csv       test.csv
gendermodel.py        test_new.csv


In [1197]:
# Read the training set through the load_data helper (made available by the %run
# cell). The call also printed the frame's shape and its first rows.
train_df = load_data('data/train.csv')


SHAPE: (891, 12)

HEAD:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  

In [1198]:
# Read the submission (test) set through the same load_data helper. Per the printed
# shape it has one column fewer than the training frame: there is no 'Survived' column.
submission_df = load_data('data/test.csv')


SHAPE: (418, 11)

HEAD:
   PassengerId  Pclass                              Name     Sex   Age  SibSp  \
0          892       3                  Kelly, Mr. James    male  34.5      0   
1          893       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0      1   
2          894       2         Myles, Mr. Thomas Francis    male  62.0      0   

   Parch  Ticket    Fare Cabin Embarked  
0      0  330911  7.8292   NaN        Q  
1      0  363272  7.0000   NaN        S  
2      0  240276  9.6875   NaN        Q  

In [1199]:
#explore(train_df)

In [1200]:
#explore(submission_df)

Swap Nulls and Feature Engineering


In [1201]:
#train_df = train_df.replace(to_replace={'Name':{",.*":""}}, regex=True)
#submission_df = submission_df.replace(to_replace={'Name':{",.*":""}}, regex=True)

In [1202]:
# Collapse 'Cabin' into a presence flag: a missing value becomes 0, and any value
# matching the catch-all pattern (i.e. any recorded cabin string) becomes 1.
# The same rule set is applied to both frames so they stay aligned.
cabin_flag_rules = {'Cabin': {None: 0, ".*": 1}}
train_df = train_df.replace(to_replace=cabin_flag_rules, regex=True)
submission_df = submission_df.replace(to_replace=cabin_flag_rules, regex=True)

In [1203]:
# Columns handed to the numericalize helper (loaded from data_wrangling.py); in the
# displayed output further down, 'Sex' and 'Embarked' appear as small integers.
col_names = ['Sex', 'Embarked']
# Training frame is processed first, then the submission frame, with the same columns.
train_df, submission_df = (
    numericalize(train_df, col_names),
    numericalize(submission_df, col_names),
)

In [1204]:
# Run the drop_obj_cols helper over both frames, training frame first; each call
# prints the columns it drops ('Name' and 'Ticket' in the recorded output).
train_df, submission_df = map(drop_obj_cols, (train_df, submission_df))
# Preview the training frame after the drop.
train_df.head()


DROPPING: Index([u'Name', u'Ticket'], dtype='object')
DROPPING: Index([u'Name', u'Ticket'], dtype='object')
Out[1204]:
PassengerId Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 1 0 3 1 22 1 0 7.2500 0 1
1 2 1 1 2 38 1 0 71.2833 1 2
2 3 1 3 2 26 0 0 7.9250 0 1
3 4 1 1 2 35 1 0 53.1000 1 1
4 5 0 3 1 35 0 0 8.0500 0 1

In [1205]:
# Columns removed from both frames before modelling.
cols_to_drop = ['PassengerId', 'SibSp', 'Parch', 'Embarked']
# Apply the identical column drop to the training and submission frames, in that order.
train_df, submission_df = (
    frame.drop(cols_to_drop, axis=1) for frame in (train_df, submission_df)
)
# Preview the remaining training columns.
train_df.head()


Out[1205]:
Survived Pclass Sex Age Fare Cabin
0 0 3 1 22 7.2500 0
1 1 1 2 38 71.2833 1
2 1 3 2 26 7.9250 0
3 1 1 2 35 53.1000 1
4 0 3 1 35 8.0500 0

In [1206]:
#age_train_df = train_df[train_df['Age'].notnull()]
#age_train_y_df = age_train_df['Age']
#age_train_X_df = age_train_df.drop('Age', axis=1)

In [1207]:
#print age_train_y_df.head(1)
#print age_train_X_df.head(1)

In [1208]:
#age_train_X, age_test_X, age_train_y, age_test_y = train_test_split(age_train_X_df, age_train_y_df, test_size=0.20, random_state=1)

In [1209]:
#clf = linear_model.LinearRegression()
#clf = clf.fit(age_train_X, age_train_y)

In [1210]:
#clf.score(age_test_X, age_test_y)


In [1211]:
# Fill missing training ages with a summary statistic of the column. The mean is used
# here; train_df['Age'].median() would be an alternative choice.
mean_train_age = train_df['Age'].mean()
train_df = swap_null(train_df, 'Age', mean_train_age)

In [1212]:
# Fill missing 'Age' and 'Fare' values in the submission set with the TRAINING-set
# means rather than the submission set's own means.
# Why: a model fit on train_df sees features imputed from training statistics, so the
# same fill values should be used at prediction time; computing them from the
# submission frame would impute the two frames on different bases.
# Note: pandas' mean() skips nulls, so train_df['Age'].mean() is the intended statistic
# whether or not the training ages have already been filled with that mean.
# swap_null(df, col_name, value) is the helper loaded from data_wrangling.py.
submission_df = swap_null(submission_df, 'Age', train_df['Age'].mean())
submission_df = swap_null(submission_df, 'Fare', train_df['Fare'].mean())


In [1213]:
# Persist both processed frames as CSV without the index column, training frame first.
for frame, out_path in (
    (train_df, 'data/train_new.csv'),
    (submission_df, 'data/test_new.csv'),
):
    frame.to_csv(out_path, index=False)


In [1214]:
#train_df = train_df[train_df['Age'].notnull()]