Fortunately, we have the ship's manifest of nearly every single passenger. But using standard Python libraries to peek and poke at that dataset is kind of a miserable experience. Fortunately, we have pandas.
Pandas provides an abstraction to load in data, manipulate it, and export your changes.
In [1]:
from __future__ import division
In [2]:
import math
In [3]:
from matplotlib import pyplot as plt
In [4]:
# Apply the 'ggplot' style sheet to the matplotlib figures drawn after this call.
plt.style.use('ggplot')
In [7]:
import pandas as pd
In [8]:
import numpy as np
In [9]:
%matplotlib inline
In [10]:
# Load the training CSV into the frame used throughout the notebook.
df = pd.read_csv(filepath_or_buffer="data/train.csv")
In [11]:
# Peek at the first rows of the loaded frame.
df.head()
Out[11]:
In [12]:
# Peek at the last rows of the loaded frame.
df.tail()
Out[12]:
In [13]:
# Let's see the types that were inferred for us on import.
# df is an object and dtypes is an attribute. A bare expression in the middle
# of a cell is never displayed (only the last one is), so print it explicitly.
print(df.dtypes)
# info is a method; it prints its summary itself.
df.info()
In [14]:
# Summary statistics of the frame, using describe()'s defaults.
df.describe()
Out[14]:
In [15]:
# Dimensions of the frame as (rows, columns).
df.shape
Out[15]:
In [16]:
# Column labels of the frame.
df.columns
Out[16]:
In [17]:
# Select a single column by its label; the resulting Series is the cell output.
df['Name']
Out[17]:
In [20]:
# Keep only the rows whose "Name" matches this passenger exactly.
my_famous_passenger = df.loc[df["Name"].eq("Guggenheim, Mr. Benjamin")]
In [23]:
# Print the selection stored in my_famous_passenger.
print(my_famous_passenger)
In [24]:
# Mean of the "Age" column.
df["Age"].mean()
Out[24]:
In [30]:
# describe is a method: call it to get the summary statistics of "Fare"
# instead of displaying the bound method object.
df["Fare"].describe()
Out[30]:
In [33]:
# Rows whose "Fare" equals the largest fare in the frame (ties are all kept).
my_rich_passenger = df.loc[df["Fare"].eq(df["Fare"].max())]
print(my_rich_passenger)
In [35]:
# Column labels as a plain Python list.
cols = df.columns.tolist()
print(cols)
In [37]:
# Second-to-last entry of the column-name list.
cols[-2]
Out[37]:
In [48]:
# Split the passengers into two frames by the value of the "Sex" column.
df_of_women = df.loc[df["Sex"].eq("female")]
df_of_men = df.loc[df["Sex"].eq("male")]
In [79]:
# Create three data frames, one per passenger class ("Pclass").
class_type = df["Pclass"].unique()
print(class_type)
# unique() returns values in order of first appearance, so indexing into
# class_type by position does not reliably pick a given class. Filter on the
# class value itself instead.
# NOTE(review): assumes the classes are coded as the integers 1, 2 and 3 —
# confirm against the printed class_type.
df_class_1 = df[df["Pclass"] == 1]
df_class_2 = df[df["Pclass"] == 2]
df_class_3 = df[df["Pclass"] == 3]
print(df_class_3.shape[0])
In [74]:
# Derived column: siblings/spouses plus parents/children, added element-wise.
df["FamilySize"] = df["SibSp"].add(df["Parch"])
df["FamilySize"].describe()
Out[74]:
In [81]:
# Histogram of "Age": 16 equal-width bins spanning 0 to 80.
df["Age"].hist(range=(0, 80), bins=16)
Out[81]:
In [82]:
# Same histogram, with missing ages dropped explicitly before plotting.
df["Age"].dropna().hist(range=(0, 80), bins=16)
Out[82]:
In [83]:
# Scatter plot with "Fare" on the x axis and "Survived" on the y axis.
plt.scatter(x=df["Fare"], y=df["Survived"])
Out[83]:
In [84]:
# Two columns of 10 uniform random numbers in [0, 1).
# A dedicated, seeded generator makes the scrap data identical on every
# Restart & Run All without touching NumPy's global random state.
rng = np.random.RandomState(42)
d = {'one': rng.rand(10),
     'two': rng.rand(10)}
In [85]:
# Print the contents of d.
print(d)
In [93]:
# Build a frame from the dict (one column per key) and summarize it.
df_scrap = pd.DataFrame.from_dict(d)
df_scrap.describe()
Out[93]:
In [92]:
# Plot the columns of df_scrap, passing one matplotlib format string per
# column: 'ro' (red circles) and 'bx' (blue x markers).
df_scrap.plot(style=['ro','bx'])
Out[92]:
In [96]:
import statsmodels.api as sm
import pylab as pl
In [112]:
# Work on an independent copy: plain assignment would only alias df, so any
# later write to new_df would silently modify df as well.
new_df = df.copy()
In [113]:
def gender_to_numeric(gender):
    """Encode a gender label as an integer.

    Parameters
    ----------
    gender : object
        The label to encode; compared for equality with the string "male".

    Returns
    -------
    int
        0 when ``gender == "male"``, 1 for every other value.
    """
    return 0 if gender == "male" else 1
In [114]:
# Build the modelling frame from df on every run, taking a copy of the needed
# columns before encoding. This keeps df["Sex"] untouched (earlier cells
# filter on its string values) and makes the cell idempotent: re-running it
# no longer re-encodes already-encoded values into all 1s.
new_df = df[["Survived","Age","Sex","Pclass"]].copy()
new_df["Sex"] = new_df["Sex"].apply(gender_to_numeric)
In [115]:
# Drop every row that has a missing value in any column, then treat all
# columns after the first one as the predictors.
new_df = new_df.dropna(axis=0, how="any")
train_cols = new_df.columns[1:]
print(train_cols)
In [119]:
# statsmodels does not add an intercept on its own. Without a constant column
# the model is forced to predict log-odds of 0 when every predictor is 0, so
# add one explicitly before fitting.
design = sm.add_constant(new_df[train_cols])
logit = sm.Logit(new_df["Survived"], design)
result = logit.fit()
In [120]:
# Print the summary of the fitted model stored in result.
print(result.summary())
In [121]:
Out[121]:
In [127]:
# Positional selection: rows at positions 1, 7 and 10, then columns at
# positions 1 and 3.
new_df.iloc[[1, 7, 10]].iloc[:, [1, 3]]
Out[127]:
In [ ]: