In [46]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# TODO: Don't hard-code the path.
# Either define a variable to store the path or keep the input files in the same folder.
# The 2nd approach is better -- done now.
train_df = pd.read_csv('train.csv') # training data as a pandas DataFrame
test_df = pd.read_csv('test.csv') # test data
full_df = [train_df, test_df] # list holding both DataFrames, so transforms can be applied to each
In [47]:
# first 10 records
train_df.head(10)
Out[47]:
In [48]:
# last 10 records
train_df.tail(10)
Out[48]:
In [49]:
# <Raghu> For now, since we are starting, let's go with the mean.
# In this example, they used a formula based on the mean and std dev of the training data's Age:
# https://www.kaggle.io/svf/560373/fcf6c03312081da830b2ab2cb26b4a1a/__results__.html#6.-Age
# We can do this later to improve learning and prediction. </Raghu>
print(train_df['Age'].value_counts(dropna=False)) # How should we handle these? Drop NaNs? Replace with mean?
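# A quick sketch of the simpler fill Raghu mentions above: impute with a single
# central value (done on a copy here, so it doesn't pre-empt the randomized
# fill we try in In [57] below; median would be a bit more outlier-robust than mean).
age_mean_filled = train_df['Age'].fillna(train_df['Age'].mean())
print(age_mean_filled.isnull().sum()) # 0 -- no NaNs left after the fill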
In [50]:
train_df.describe() # Only about 38% of passengers survived; mean age is about 29.7.
Out[50]:
In [51]:
train_df.info()
In [52]:
train_df['Pclass'].plot(kind='hist', rot=0, logx=True, logy=True)
# Large majority 3rd class, smaller portion 1st, even smaller portion 2nd
Out[52]:
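# Sanity check on the histogram above: exact passenger counts per class.
train_df['Pclass'].value_counts()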
In [53]:
train_df.plot(kind='scatter', x='Age', y='Fare') #Outliers?
Out[53]:
In [54]:
sns.lmplot(x='Age', y='Fare', data=train_df, hue='Pclass')
# Probably not outliers/errors in the data -- likely just very expensive 1st-class tickets.
Out[54]:
In [55]:
sns.residplot(x='Age', y='Fare', data=train_df, dropna=True)
Out[55]:
In [56]:
sns.boxplot(x='Pclass', y='Fare', data=train_df)
Out[56]:
In [57]:
# <Rachel> I went to the link provided above for this calculation, but was confused by the Warning message.
# I found a similar method on GitHub that shows the distribution before and after the random values are generated. </Rachel>
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')
# plot original Age values (drop null values and convert to int)
train_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
# get average, std and number of NaN values
average_age = train_df["Age"].mean()
std_age = train_df["Age"].std()
count_nan_age = train_df["Age"].isnull().sum()
# generate random numbers between (mean - std) & (mean + std)
rand_age = np.random.randint(int(average_age - std_age), int(average_age + std_age), size=count_nan_age)
# fill NaN values in Age column with random values generated
age_slice = train_df["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
# plot imputed Age values
age_slice.astype(int).hist(bins=70, ax=axis2)
Out[57]:
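# Note: the randint fill above draws fresh values on every run. For reproducible
# imputation we could seed NumPy first (seed value is arbitrary):
# np.random.seed(0) # would go before the np.random.randint(...) call above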
In [58]:
# Distribution looks good - replace Age vector in original data with new values
train_df["Age"] = age_slice
# Show number of missing Age values
train_df["Age"].isnull().sum()
Out[58]:
In [59]:
# Fill missing values with most common value
train_df['Embarked'] = train_df['Embarked'].fillna('S')
# Show number of missing values
train_df['Embarked'].isnull().sum()
Out[59]:
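# An alternative sketch to hard-coding 'S': derive the most common port from
# the data itself, so the fill stays correct if the input changes.
most_common_port = train_df['Embarked'].mode()[0] # 'S' in this dataset
print(most_common_port)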
In [60]:
train_df['Fare'] = train_df.Fare.apply(lambda x: x if x > 0 else np.nan) # Replace zero fares with NaNs (use np.nan; pd.np is deprecated)
train_df['Fare'].isnull().sum() # Checked to make sure they are now recognized as null
Out[60]:
In [61]:
m = train_df.groupby('Pclass')['Fare'].mean() # Mean fare per class (select Fare before mean so non-numeric columns don't break the groupby)
m
Out[61]:
In [62]:
# Replace NaN fares with the mean fare of that passenger's class
train_df['Fare'] = train_df.apply(lambda row: m[row['Pclass']]
                                  if pd.isnull(row['Fare'])
                                  else row['Fare'],
                                  axis=1)
train_df['Fare'].isnull().sum() # Checked to make sure there are no longer missing values
Out[62]:
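# Equivalent vectorized sketch of the fill above: map each row's Pclass to its
# class mean and use it only where Fare is missing (a no-op at this point,
# since the apply above already filled everything).
train_df['Fare'] = train_df['Fare'].fillna(train_df['Pclass'].map(m))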
In [63]:
# Transform Embarked (S = 0, C = 1, Q = 2)
train_df['Embarked'] = train_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
# Transform Sex (male = 0, female = 1)
train_df['Sex'] = train_df['Sex'].map({'female': 1, 'male': 0}).astype(int)
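# The same encodings should eventually go on test_df too -- that is what
# full_df = [train_df, test_df] was set up for. A sketch, assuming test_df's
# Embarked has no missing entries (the test set has its own gaps, e.g. missing
# Ages and one missing Fare, to handle separately):
test_df['Sex'] = test_df['Sex'].map({'female': 1, 'male': 0}).astype(int)
test_df['Embarked'] = test_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)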
In [64]:
# With the target variable "Survived", I would recommend starting with Sex, Age, Pclass and Fare as our predictors...
# Let me know what you think!
# <Raghu> Yes, let's start with these. We still have to engineer the data (fill NaNs)
# and clean it to reduce overfitting. </Raghu>
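# A minimal baseline to sanity-check those predictors -- a sketch, assuming
# scikit-learn is installed; the model choice (logistic regression) is arbitrary:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

features = ['Sex', 'Age', 'Pclass', 'Fare']
X = train_df[features]
y = train_df['Survived']
model = LogisticRegression(max_iter=1000)
print(cross_val_score(model, X, y, cv=5).mean()) # mean 5-fold CV accuracy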