Dan Tamayo
Material draws from a blog working through the Titanic dataset by Dave Novelli, as well as the Pandas Cookbook by Julia Evans
In [70]:
import pandas as pd
df = pd.read_csv('data/train.csv')
In [2]:
df.head()
Out[2]:
In [3]:
df.tail()
Out[3]:
In [4]:
df.shape
Out[4]:
In [5]:
df['Fare'].head()
Out[5]:
In [6]:
df[['Fare', 'Sex']].head()
Out[6]:
In [7]:
df['Sex'].value_counts()
Out[7]:
In [8]:
df['Age'].median()
Out[8]:
In [71]:
%matplotlib inline
import seaborn
fig = df['Pclass'].hist()
In [72]:
fig = df.hist(figsize=(15,5))
In [73]:
df.head()
Out[73]:
In [74]:
# Element-wise comparison: the result is a boolean Series with one entry per
# row, True where that row's 'Embarked' value equals 'C'.
mask = df['Embarked'] == 'C'
mask.head()
Out[74]:
In [75]:
df_filter = df[mask] # df_filter = df[df['Embarked'] == 'C']
df_filter.head()
Out[75]:
In [81]:
df_filter = df.loc[df['Embarked'] == 'C']
df_filter.head()
Out[81]:
In [82]:
test = df.loc[df['Age'] > 30., ['Age', 'Fare', 'Sex']]
test.head()
Out[82]:
In [83]:
# Two row conditions combined with `&`; each comparison is parenthesised
# because `&` binds more tightly than `>` and `<`.
# 'Age':'Fare' is a label slice over the columns: it selects every column
# positioned from 'Age' through 'Fare', both end points included.
test = df.loc[(df['Age'] > 30.) & (df['Fare'] < 50.), 'Age':'Fare']
test.head()
Out[83]:
In [84]:
df.head()
Out[84]:
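Wrong way (NaN is not equal to anything, not even to itself, so the `!=` comparison is True for every row and nothing is filtered out):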
In [90]:
import numpy as np
df[df['Cabin']!=np.nan]
Out[90]:
Right way (use the null-aware `notnull()` / `isnull()` methods):
In [91]:
df.loc[df['Cabin'].notnull()]
Out[91]:
In [92]:
df.notnull().head()
Out[92]:
In [93]:
df_filter = df.loc[df.notnull().all(axis=1)]
In [94]:
df_filter.shape
Out[94]:
In [105]:
# Replace every missing 'Cabin' entry with the placeholder 'U0'
# (same result as assigning through an isnull() mask with .loc).
df['Cabin'] = df['Cabin'].fillna('U0')
df.head()
Out[105]:
In [106]:
df.loc[df['Fare'].isnull()].shape
Out[106]:
In [107]:
df['Fare'].median()
Out[107]:
In [108]:
# Missing fares take the median of the fares that are present.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
In [109]:
df.loc[df['Embarked'].isnull()].shape
Out[109]:
In [111]:
df['Embarked'].mode()
Out[111]:
In [112]:
df['Embarked'].mode()[0]
Out[112]:
In [113]:
# Missing ports take the most frequent non-null 'Embarked' value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].dropna().mode()[0])
In [114]:
# Print, for every column, how many of its entries are still missing.
for col in df.columns:
    print("NaNs in column {0} = {1}".format(col, df[col].isnull().sum()))
In [115]:
# Missing ages take the mean of the ages that are present.
df['Age'] = df['Age'].fillna(df['Age'].mean())
In [116]:
df.head()
Out[116]:
In [117]:
print(df['Embarked'].unique())
In [118]:
dummies_df = pd.get_dummies(df['Embarked'])
dummies_df.head()
Out[118]:
In [119]:
def addEmbarked(name):
    """Return *name* with the prefix 'Embarked_' put in front of it."""
    return ''.join(('Embarked_', name))
# Passing a function as `columns=` applies it to every column label, so each
# dummy column gets the 'Embarked_' prefix.
dummies_df = dummies_df.rename(columns=addEmbarked)
dummies_df.head()
Out[119]:
In [120]:
df = pd.concat([df, dummies_df], axis=1)
df.head()
Out[120]:
In [121]:
# pd.factorize returns (integer codes, unique values); [0] keeps the codes.
# The codes number the distinct values in order of first appearance.
df['EmbarkedNum'] = pd.factorize(df['Embarked'])[0]
df.head(6)
Out[121]:
In [122]:
pd.factorize(df['Sex'])
Out[122]:
In [123]:
# factorize numbers the distinct values in order of first appearance, so
# which sex receives code 1 depends on the value in the first row of the
# data. NOTE(review): confirm against the uniques shown by pd.factorize(df['Sex'])
# that 'female' really maps to 1 before relying on the column name.
df['Female'] = pd.factorize(df['Sex'])[0]
df.head()
Out[123]:
In [124]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the 'Age' and 'Fare' columns only, print the per-column
# statistics it learned, then overwrite both columns with the transformed values.
scaler = StandardScaler().fit(df[['Age', 'Fare']]) # .fit(df) to scale all numerical columns
print("Means = {0}".format(scaler.mean_))
print("Stdevs = {0}".format(scaler.scale_))
# From here on df['Age'] and df['Fare'] hold scaled values, not the raw ones.
df[['Age', 'Fare']] = scaler.transform(df[['Age', 'Fare']])
df.head()
Out[124]:
In [125]:
fares = df['Fare'].values
type(fares)
Out[125]:
In [126]:
input_df = pd.read_csv('data/train.csv')
submit_df = pd.read_csv('data/test.csv')
print(input_df.shape)
print(submit_df.shape)
In [127]:
submit_df.head()
Out[127]:
In [128]:
input_df.head()
Out[128]:
In [129]:
submit_df = pd.read_csv('data/test.csv', index_col=0)
submit_df.head()
Out[129]:
In [130]:
# Read both files using their first CSV column as the row index, then stack
# the two frames row-wise so the same processing can be applied to all rows.
# This rebinds `df`, discarding the frame built up in the earlier cells.
input_df = pd.read_csv('data/train.csv', index_col=0)
submit_df = pd.read_csv('data/test.csv', index_col=0)
df = pd.concat([input_df, submit_df])
df.tail()
Out[130]:
In [131]:
print(df.shape[1], "columns")
print(df.shape[0], "rows")
print(df.columns.values)
In [49]:
def process_data(df):
    """Impute missing values and add encoded 'Sex' / 'Embarked' columns.

    The work is done on a copy, so the frame passed in is left untouched
    (the previous version added and overwrote columns on the caller's frame
    while returning a different, concatenated frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sex', 'Age', 'Fare', 'Cabin' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        A new frame with:
        * 'Age' gaps filled with the mean, 'Fare' gaps with the median,
          'Cabin' gaps with 'U0', 'Embarked' gaps with the most frequent value;
        * 'Female': 0 for 'male', 1 for 'female', -1 for anything else;
        * one 'Embarked_<value>' indicator column per distinct 'Embarked' value;
        * 'EmbarkedNum': integer code per 'Embarked' value, numbered in order
          of first appearance.
    """
    df = df.copy()

    # Fixed category order keeps the encoding independent of which sex happens
    # to appear in the first row (pd.factorize numbers by first appearance).
    df['Female'] = pd.Categorical(df['Sex'], categories=['male', 'female']).codes.astype('int64')

    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('U0')
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].dropna().mode()[0])

    # prefix= yields the same 'Embarked_<value>' labels as renaming afterwards.
    dummies_df = pd.get_dummies(df['Embarked'], prefix='Embarked')
    df = pd.concat([df, dummies_df], axis=1)

    df['EmbarkedNum'] = pd.factorize(df['Embarked'])[0]
    return df
In [50]:
df = process_data(df)
df.tail()
Out[50]:
In [51]:
# Columns handed to the model.
features = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Female', 'EmbarkedNum']
# Rows whose 'Survived' value is missing become df_test (presumably the rows
# that came from test.csv, which would carry no label — confirm); the
# remaining rows become df_train and keep the 'Survived' column.
df_test = df.loc[df['Survived'].isnull(), features]
df_train = df.loc[df['Survived'].notnull(), features+['Survived']]
df_train.head()
Out[51]:
In [52]:
# Pull plain NumPy arrays out of the frame: X_train holds the feature
# columns, y_train the 'Survived' column.
X_train = df_train[features].values
y_train = df_train['Survived'].values
# Show the first five rows of each array and check the row counts agree.
print(X_train[0:5])
print(y_train[0:5])
print("X has {0} rows".format(X_train.shape[0]))
print("y has {0} rows".format(y_train.shape[0]))
In [53]:
df['Cabin'].unique()
Out[53]:
In [54]:
import re
def getDeck(cabin):
    """Return the first capital letter found in a cabin string.

    Returns None when the string contains no capital letter, and also when
    *cabin* is not a string at all (for example the float NaN that pandas
    uses for a missing value) instead of raising a TypeError.
    """
    if not isinstance(cabin, str):
        return None
    match = re.search(r"([A-Z])", cabin)
    return match.group(1) if match is not None else None
def getCabinNum(cabin):
    """Return the first run of digits found in a cabin string, as a string.

    Returns None when the string contains no digit, and also when *cabin* is
    not a string at all (for example the float NaN that pandas uses for a
    missing value) instead of raising a TypeError.
    """
    if not isinstance(cabin, str):
        return None
    match = re.search(r"([0-9]+)", cabin)
    return match.group(1) if match is not None else None
print(getDeck('C237'))
print(getCabinNum('C237'))
In [55]:
# Series.map calls the function once per 'Cabin' entry; the results become
# the new 'Deck' and 'CabinNum' columns.
df['Deck'] = df['Cabin'].map(getDeck)
df['CabinNum'] = df['Cabin'].map(getCabinNum)
df.head()
Out[55]:
In [56]:
df['CabinNum'].isnull().value_counts()
Out[56]:
In [57]:
df.loc[df['CabinNum'].isnull(), 'CabinNum'] = 0
In [58]:
df['Deck'].isnull().value_counts()
Out[58]:
In [59]:
df['DeckNum'] = pd.factorize(df['Deck'])[0]
In [60]:
testname = df.loc[1, 'Name']
print(testname)
In [61]:
re.split(' ', testname)
Out[61]:
In [62]:
def numNames(name):
    """Count the pieces obtained by splitting *name* at every single space."""
    pieces = re.split(' ', name)
    return len(pieces)
df['NumNames'] = df['Name'].map(numNames)
df.head()
Out[62]:
In [63]:
testname
Out[63]:
In [64]:
# Raw string: "\." in a plain string literal is an invalid escape sequence.
# The pattern captures the text between ", " and the next "." in the name.
re.search(r", (.+?)\.", testname).group(1)
Out[64]:
In [65]:
def getTitle(name):
    """Return the text between ', ' and the following '.' in *name*.

    Returns None when the pattern is absent, and also when *name* is not a
    string (for example the float NaN that pandas uses for a missing value)
    instead of raising a TypeError.
    """
    if not isinstance(name, str):
        return None
    # Raw string: "\." in a plain string literal is an invalid escape sequence.
    match = re.search(r", (.*?)\.", name)
    return match.group(1) if match is not None else None
df['Title'] = df['Name'].map(getTitle)
df.head()
Out[65]:
In [66]:
# Fold rare or foreign-language titles into a small set of canonical ones.
# Each pair is (titles to replace, replacement).
for aliases, canonical in (
    (['Jonkheer'], 'Master'),
    (['Ms', 'Mlle'], 'Miss'),
    (['Mme'], 'Mrs'),
    (['Capt', 'Don', 'Major', 'Col', 'Sir'], 'Sir'),
    (['Dona', 'Lady', 'the Countess'], 'Lady'),
):
    df.loc[df['Title'].isin(aliases), 'Title'] = canonical
df['Title'].value_counts()
Out[66]:
In [67]:
# One indicator column per distinct title, labelled 'Title_<title>',
# appended to the right of the existing columns.
title_dummies = pd.get_dummies(df['Title'], prefix='Title')
df = pd.concat([df, title_dummies], axis=1)
df.head()
Out[67]:
In [68]:
def process_data(df):
    """Impute missing values and derive the engineered feature columns.

    The work is done on a copy, so the frame passed in is left untouched
    (the previous version added and overwrote columns on the caller's frame
    while returning a different, concatenated frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked'
        and 'Name'. 'Name' entries must be strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with:
        * 'Age' gaps filled with the mean, 'Fare' gaps with the median,
          'Cabin' gaps with 'U0', 'Embarked' gaps with the most frequent value;
        * 'Female': 0 for 'male', 1 for 'female', -1 for anything else;
        * 'Embarked_<value>' indicator columns and 'EmbarkedNum' integer codes;
        * 'Deck' (first capital letter of 'Cabin') and 'DeckNum' (its code);
        * 'CabinNum': first run of digits in 'Cabin' as an int, 0 when the
          cabin has no digits (previously a mix of digit strings and int 0);
        * 'NumNames': number of space-separated pieces in 'Name';
        * 'Title' (normalised) and 'Title_<title>' indicator columns.
    """
    import re  # function-scope import keeps this cell runnable on its own

    def getDeck(cabin):
        """First capital letter in the cabin string, or None."""
        match = re.search(r"([A-Z])", cabin)
        return match.group(1) if match is not None else None

    def getCabinNum(cabin):
        """First run of digits in the cabin string as an int; 0 if none."""
        match = re.search(r"([0-9]+)", cabin)
        return int(match.group(1)) if match is not None else 0

    def numNames(name):
        """Number of pieces after splitting the name at every single space."""
        return len(re.split(' ', name))

    def getTitle(name):
        """Text between ', ' and the next '.', or None when absent."""
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        match = re.search(r", (.*?)\.", name)
        return match.group(1) if match is not None else None

    # Rare or foreign-language titles folded into a canonical title.
    title_aliases = {
        'Jonkheer': 'Master',
        'Ms': 'Miss', 'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Col': 'Sir',
        'Dona': 'Lady', 'the Countess': 'Lady',
    }

    df = df.copy()

    # Fixed category order keeps the encoding independent of which sex happens
    # to appear in the first row (pd.factorize numbers by first appearance).
    df['Female'] = pd.Categorical(df['Sex'], categories=['male', 'female']).codes.astype('int64')

    # --- imputation ---
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('U0')
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].dropna().mode()[0])

    # --- port of embarkation ---
    # prefix= yields the same 'Embarked_<value>' labels as renaming afterwards.
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df['EmbarkedNum'] = pd.factorize(df['Embarked'])[0]

    # --- cabin ---
    df['Deck'] = df['Cabin'].map(getDeck)
    df['DeckNum'] = pd.factorize(df['Deck'])[0]
    df['CabinNum'] = df['Cabin'].map(getCabinNum)

    # --- name ---
    df['NumNames'] = df['Name'].map(numNames)
    df['Title'] = df['Name'].map(getTitle)
    # Titles without an alias are kept as they are.
    df['Title'] = df['Title'].map(lambda title: title_aliases.get(title, title))
    df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)
    return df
In [69]:
# Full pipeline in one cell: reload both CSV files, stack them, run
# process_data on the combined frame, split the rows again and save them.
input_df = pd.read_csv('data/train.csv', index_col=0)
submit_df = pd.read_csv('data/test.csv', index_col=0)
df = pd.concat([input_df, submit_df])
df = process_data(df)
# The 'Title_*' names must match the indicator columns process_data creates;
# a title that is absent from the data would make the selection below fail.
features = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Female', 'EmbarkedNum', 'DeckNum', 'CabinNum', 'NumNames', 'Title_Dr','Title_Lady','Title_Master','Title_Miss','Title_Mr','Title_Mrs','Title_Rev','Title_Sir']
# Rows with a missing 'Survived' value go to df_test; the others go to
# df_train together with their 'Survived' column.
df_test = df.loc[df['Survived'].isnull(), features]
df_train = df.loc[df['Survived'].notnull(), features+['Survived']]
df_train.to_csv('data/train_processed.csv') # NOT FEATURE SCALED!
df_test.to_csv('data/test_processed.csv')
In [ ]: