In [1195]:
# Execute the shared data-wrangling script, passing "functions" as its
# command-line argument. The helpers called in later cells (load_data,
# numericalize, drop_obj_cols, swap_null) are not defined in this notebook;
# presumably this script provides them -- confirm.
# NOTE(review): absolute, user-specific path; the notebook only runs where the
# script exists at this exact location.
%run /Users/excalibur/py/scripts/data_wrangling.py functions
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [1196]:
# List the contents of the data directory that later cells read from and
# write to.
%ls data
In [1197]:
# Read the training set. load_data is not defined in this notebook
# (presumably supplied by the %run script in the first cell -- confirm).
train_df = load_data('data/train.csv')
In [1198]:
# Read the file that will be scored for submission, using the same external
# load_data helper as the training set.
submission_df = load_data('data/test.csv')
In [1199]:
#explore(train_df)
In [1200]:
#explore(submission_df)
In [1201]:
#train_df = train_df.replace(to_replace={'Name':{",.*":""}}, regex=True)
#submission_df = submission_df.replace(to_replace={'Name':{",.*":""}}, regex=True)
In [1202]:
# Collapse 'Cabin' into a binary "cabin recorded" flag: 1 when a value is
# present, 0 when it is missing.
# This was previously written as a regex replace with a {None: 0, ".*": 1}
# mapping, which mixes a non-regex None key into a regex replacement and
# leaves the dtype of the result to pandas. Building the flag from the null
# mask states the intent directly and always yields an integer column, so a
# later step that removes object-typed columns cannot discard it.
# assign() returns a new frame and keeps 'Cabin' in its original position.
train_df = train_df.assign(Cabin=train_df['Cabin'].notnull().astype(int))
submission_df = submission_df.assign(Cabin=submission_df['Cabin'].notnull().astype(int))
In [1203]:
# Text-valued columns handed to numericalize. That helper is defined outside
# this notebook; presumably it encodes the listed columns as numbers -- confirm.
col_names = ['Sex', 'Embarked']
# Apply the same conversion to the training frame first, then the submission
# frame.
train_df, submission_df = (
    numericalize(train_df, col_names),
    numericalize(submission_df, col_names),
)
In [1204]:
# Run drop_obj_cols over both frames (training first). The helper is defined
# outside this notebook; presumably it removes object-dtype columns -- confirm.
train_df, submission_df = map(drop_obj_cols, (train_df, submission_df))
# Preview the first rows of the training frame.
train_df.head()
Out[1204]:
In [1205]:
# Columns excluded from the exported feature set.
# NOTE(review): 'Embarked' is passed to numericalize in an earlier cell and is
# dropped here; whether that helper adds differently named columns that
# survive this step is not visible from this notebook -- confirm.
cols_to_drop = ['PassengerId', 'SibSp', 'Parch', 'Embarked']
train_df, submission_df = (
    frame.drop(cols_to_drop, axis=1) for frame in (train_df, submission_df)
)
# Preview the remaining training columns.
train_df.head()
Out[1205]:
In [1206]:
#age_train_df = train_df[train_df['Age'].notnull()]
#age_train_y_df = age_train_df['Age']
#age_train_X_df = age_train_df.drop('Age', axis=1)
In [1207]:
#print age_train_y_df.head(1)
#print age_train_X_df.head(1)
In [1208]:
#age_train_X, age_test_X, age_train_y, age_test_y = train_test_split(age_train_X_df, age_train_y_df, test_size=0.20, random_state=1)
In [1209]:
#clf = linear_model.LinearRegression()
#clf = clf.fit(age_train_X, age_train_y)
In [1210]:
#clf.score(age_test_X, age_test_y)
In [1211]:
# Fill missing 'Age' values in the training set with the column mean; the mean
# is evaluated before swap_null is called.
# swap_null is defined outside this notebook; presumably it replaces nulls in
# the named column with the given value -- confirm.
# Another statistic (e.g., train_df['Age'].median()) could be passed instead.
train_df = swap_null(train_df, 'Age', train_df['Age'].mean())
In [1212]:
# Fill missing 'Age' and then 'Fare' values in the submission set, each with
# the mean of that same column in the submission set.
# NOTE(review): the training set is imputed from its own mean, so the two sets
# are filled with different values -- confirm this is intended.
for _column in ('Age', 'Fare'):
    submission_df = swap_null(submission_df, _column, submission_df[_column].mean())
In [1213]:
# Persist both processed frames as CSV without the index column, training set
# first.
for _frame, _path in (
    (train_df, 'data/train_new.csv'),
    (submission_df, 'data/test_new.csv'),
):
    _frame.to_csv(_path, index=False)
In [1214]:
#train_df = train_df[train_df['Age'].notnull()]