I got lots of the ideas for this first Kaggle adventure from here
In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Use seaborn's white-grid look for every figure drawn below.
sns.set_style("whitegrid")

# Read the training set from the working directory, forcing Age to float64.
train_df = pd.read_csv("train.csv", dtype={"Age": np.float64})
In [2]:
# Peek at the first five rows of the training frame.
train_df.head(n=5)
Out[2]:
In [3]:
# Number of rows with a recorded (non-NaN) age.
train_df.Age.count()
Out[3]:
In [4]:
# Number of rows whose age is missing (NaN).
pd.isnull(train_df["Age"]).sum()
Out[4]:
In [5]:
# Distribution of the recorded ages: drop the NaN entries and truncate to
# whole years once, then reuse that series for both the plot and the mean.
known_ages = train_df['Age'].dropna().astype(int)
known_ages.hist(bins=70)
# A parenthesised single-argument print runs on both Python 2 and Python 3.
# The two spaces after '=' reproduce exactly what the old statement
# `print 'Mean age = ', value` wrote (it inserts a separator space).
print('Mean age =  %s' % known_ages.mean())
In [6]:
# First five values of the Embarked column.
train_df["Embarked"].head(n=5)
Out[6]:
In [7]:
# Print the frame summary (columns, non-null counts, dtypes) to spot
# columns with missing values.
train_df.info()
In [8]:
# Number of rows with no embarkation port recorded.
pd.isnull(train_df["Embarked"]).sum()
Out[8]:
In [9]:
# Number of rows that do have an embarkation port recorded.
train_df.Embarked.count()
Out[9]:
In [10]:
# Passenger count per embarkation port.
sns.countplot(data=train_df, x="Embarked")
Out[10]:
In [11]:
# Passenger count per Survived value (0 then 1), split by embarkation port.
sns.countplot(
    data=train_df,
    x="Survived",
    hue="Embarked",
    order=[0, 1],
)
Out[11]:
In [12]:
# Mean of Survived within each embarkation port. as_index=False keeps
# 'Embarked' as an ordinary column so the plot can refer to it by name.
embark_survive_perc = (
    train_df[["Embarked", "Survived"]]
    .groupby(["Embarked"], as_index=False)
    .mean()
)
sns.barplot(
    data=embark_survive_perc,
    x="Embarked",
    y="Survived",
    order=["S", "C", "Q"],
)
Out[12]:
In [13]:
# Histogram of fares truncated to whole units, zoomed in on the 0-50 range.
train_df["Fare"].astype(int).plot(
    kind="hist",
    bins=100,
    xlim=(0, 50),
)
Out[13]:
In [14]:
# Whole-unit fares, split by outcome (Survived == 0 vs. Survived == 1).
fare_not_survived = train_df["Fare"].astype(int).loc[train_df["Survived"] == 0]
fare_survived = train_df["Fare"].astype(int).loc[train_df["Survived"] == 1]

# One-column frames holding the per-outcome mean and standard deviation;
# row 0 is "did not survive", row 1 is "survived".
avgerage_fare = DataFrame([
    fare_not_survived.mean(),
    fare_survived.mean(),
])
std_fare = DataFrame([
    fare_not_survived.std(),
    fare_survived.std(),
])

# Label the row index of both frames so the bar chart's x axis reads "Survived".
avgerage_fare.index.names = ["Survived"]
std_fare.index.names = ["Survived"]

# Mean fare per outcome, with the standard deviation drawn as error bars.
avgerage_fare.plot(kind="bar", yerr=std_fare, legend=False)
Out[14]:
In [15]:
# Fill the missing ages in train_df with random whole-year values drawn
# uniformly from [mean - std, mean + std) of the recorded ages.
average_age_train = train_df["Age"].mean()
std_age_train = train_df["Age"].std()
missing_age_mask = train_df["Age"].isnull()
count_nan_age_train = missing_age_mask.sum()

# randint expects integer bounds, so truncate the float statistics explicitly.
# A dedicated, seeded RandomState makes the imputed ages identical on every
# Restart & Run All without touching NumPy's global random state.
age_rng = np.random.RandomState(0)
rand_1 = age_rng.randint(
    int(average_age_train - std_age_train),
    int(average_age_train + std_age_train),
    size=count_nan_age_train,
)

# Write through a single .loc call. The chained form
# train_df['Age'][mask] = ... assigns into an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to reach train_df.
train_df.loc[missing_age_mask, "Age"] = rand_1
In [19]:
# Store Age as whole years. astype(int) needs every NaN to be gone already;
# the previous cell fills the missing ages with random values.
train_df["Age"] = train_df.Age.astype(int)

# Histogram of the completed Age column. Compared with the histogram of the
# raw ages (NaNs dropped) a few cells up, it is not much different.
train_df.Age.hist(bins=70)
Out[19]:
In [20]:
## Survival vs. age, plot 1: density (KDE) of Age, one curve per Survived value.
# NOTE: the plotting code below is wrapped in a triple-quoted string, so it is
# NOT executed -- the cell only evaluates the string itself. Remove the
# surrounding quotes to actually draw the plot.
"""
facet = sns.FacetGrid(train_df, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, train_df['Age'].max()))
facet.add_legend()
"""
Out[20]:
In [21]:
# Survival vs. age, plot 2: mean of Survived for each distinct Age, as bars.
# NOTE: the plotting code below is wrapped in a triple-quoted string, so it is
# NOT executed -- the cell only evaluates the string itself. Remove the
# surrounding quotes to actually draw the plot.
"""
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = train_df[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)
"""
Out[21]:
In [ ]: