In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy
from numpy import corrcoef, sum, log, arange, exp, isnan
import csv
import nltk
# Load the training and test CSV files from the current working directory.
titanic = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Row counts plus the percentage of rows that contain no missing value at all.
train_rows = len(titanic)
test_rows = len(test)
train_complete_pct = float(len(titanic.dropna())) / train_rows * 100
test_complete_pct = float(len(test.dropna())) / test_rows * 100
print("Training set : %i (%0.2f) , Test set : %i (%0.2f)" % (train_rows, train_complete_pct, test_rows, test_complete_pct))
print("Features : %i" % len(titanic.columns))

# One line per column: name, non-null count, type reported by the helper,
# and number of distinct values (NaN counts as a distinct value here).
# NOTE(review): get_my_column_type is not defined in the visible cells — it
# must already exist in the kernel for this loop to run.
for column_name in titanic.columns:
    column = titanic[column_name]
    print(column_name, len(column.dropna()), get_my_column_type(column), len(column.unique()))
print(", ".join(titanic.columns))
In [138]:
# Map each honorific that may appear in a passenger name onto one of four
# canonical titles ('Mr.', 'Mrs.', 'Miss.', 'Master').
titles = {'Mr.':'Mr.', 'Miss.':'Miss.', 'Mrs.':'Mrs.', 'Master':'Master',
          'Ms.':'Mrs.', 'Mme.':'Mrs.', 'Countess.':'Mrs.',
          'Mlle.':'Miss.',
          'Don.':'Mr.', 'Rev.':'Mr.', 'Dr.':'Mr.', 'Major.':'Mr.', 'Col.':'Mr.', 'Capt.':'Mr.', 'Jonkheer.':'Mr.'}

# Start from an empty title; rows whose name matches none of the keys keep ''.
titanic["title"] = ''
test["title"] = ''
for title in titles:
    for frame in (titanic, test):
        # regex=False: the keys end in '.', which as a regular expression
        # matches ANY character, so 'Mr.' would also match "Mrs" and label
        # married women as 'Mr.'. A literal substring match avoids that.
        # na=False: a missing name simply does not match.
        name_has_title = frame["Name"].str.contains(title, regex=False, na=False)
        # Only fill rows that are still untitled, so the first matching key wins.
        frame.loc[name_has_title & (frame["title"] == ''), 'title'] = titles[title]
In [178]:
# ticket_list maps each training-set ticket to the number of OTHER passengers
# holding the same ticket (occurrences minus one). Missing tickets are ignored.
ticket_list = {}
for the_ticket in titanic.Ticket.dropna():
    ticket_list[the_ticket] = ticket_list.get(the_ticket, -1) + 1

for frame in (titanic, test):
    # Ticket companions are only counted for passengers with SibSp == 0 and
    # Parch == 0; everyone else keeps 0 in "on_the_ticket".
    no_listed_family = (frame["SibSp"] == 0) & (frame["Parch"] == 0)
    frame["on_the_ticket"] = 0
    # A single vectorised lookup replaces one .loc scan per ticket. Tickets
    # that never occur in the training set map to NaN and therefore become 0.
    frame.loc[no_listed_family, "on_the_ticket"] = (
        frame.loc[no_listed_family, "Ticket"].map(ticket_list).fillna(0).astype(int)
    )
    frame["total_relatives"] = frame["on_the_ticket"] + frame["SibSp"] + frame["Parch"]
    # Boolean flag: True when the passenger has at least one companion.
    frame["in_a_group"] = frame["total_relatives"] > 0
In [200]:
# Survival counts for every (title, passenger class) combination.
# Requires the "title" column to have been added to `titanic` beforehand.
survived_flag = titanic["Survived"].astype(bool)
death_counts = pd.crosstab([titanic["title"], titanic["Pclass"]], survived_flag)

bar_colors = ['black', 'gold']
# Absolute counts as stacked vertical bars.
death_counts.plot(kind='bar', stacked=True, color=bar_colors, grid=False)
# Same table with each row divided by its total, as stacked horizontal bars.
row_totals = death_counts.sum(axis=1).astype(float)
death_counts.div(row_totals, axis=0).plot(kind='barh', stacked=True, color=bar_colors)
Out[200]:
In [71]:
# Count how often each token occurs across all non-missing passenger names.
bag_of_words = {}
for passenger_name in titanic.Name.dropna():
    for token in nltk.tokenize.word_tokenize(passenger_name):
        bag_of_words[token] = bag_of_words.get(token, 0) + 1

# The 15 most frequent tokens longer than two characters, as (token, count)
# pairs in descending order of count.
long_tokens = [(word, count) for word, count in bag_of_words.items() if len(word) > 2]
sorted(long_tokens, key=lambda pair: pair[1], reverse=True)[0:15]
Out[71]:
In [198]:
# Survival by passenger class and age decade for a single title.
the_type = "Mr."
with_title = titanic[titanic['title'] == the_type]
# Floor-dividing the age by 10 groups passengers into ten-year bands.
age_band = with_title["Age"] // 10
death_counts = pd.crosstab([with_title["Pclass"], age_band], with_title["Survived"].astype(bool))

bar_colors = ['black', 'gold']
# Absolute counts as stacked vertical bars.
death_counts.plot(kind='bar', stacked=True, color=bar_colors, grid=False)
# Same table with each row divided by its total, as stacked horizontal bars.
row_totals = death_counts.sum(axis=1).astype(float)
death_counts.div(row_totals, axis=0).plot(kind='barh', stacked=True, color=bar_colors)
Out[198]:
In [137]:
# Show every training row whose ticket contains the substring "237736".
ticket_matches = titanic["Ticket"].str.contains("237736")
titanic.loc[ticket_matches]
Out[137]:
In [192]:
# Rows with Pclass == 3 that have a non-null Cabin value.
is_third_class = titanic["Pclass"] == 3
has_cabin = titanic["Cabin"].notnull()
titanic[is_third_class & has_cabin]
Out[192]:
In [227]:
# Survival by Embarked_num and title for a single passenger class.
# NOTE(review): "Embarked_num" is not created in this cell. The category
# encoding cell further down creates "<column>_num" columns and presumably
# produces it — confirm that cell has run before this one.
the_type = 2
in_class = titanic[titanic['Pclass'] == the_type]
death_counts = pd.crosstab([in_class["Embarked_num"], in_class["title"]], in_class["Survived"].astype(bool))

bar_colors = ['black', 'gold']
# Absolute counts as stacked vertical bars.
death_counts.plot(kind='bar', stacked=True, color=bar_colors, grid=False)
# Same table with each row divided by its total, as stacked horizontal bars.
row_totals = death_counts.sum(axis=1).astype(float)
death_counts.div(row_totals, axis=0).plot(kind='barh', stacked=True, color=bar_colors)
Out[227]:
In [ ]:
# One [name, type, distinct-value count] entry per training column.
# NOTE(review): get_my_column_type is not defined in the visible cells — it
# must already exist in the kernel.
cols_describe = []
for i in titanic.columns:
    # list.append accepts exactly one argument, so the three values are
    # wrapped in a single list.
    cols_describe.append([i, get_my_column_type(titanic[i]), len(titanic[i].unique())])

# Numeric code for each of the four main titles; names matching none keep 0.
titles_num = {'Mr.':4, 'Miss.':2,'Mrs.':3,'Master':1}
titanic["title_num"] = 0
test["title_num"] = 0
for title in titles_num:
    for frame in (titanic, test):
        # regex=False: as a regular expression the '.' in 'Mr.' matches any
        # character, so "Mrs." names would be coded as 'Mr.' (4).
        # na=False: a missing name simply does not match.
        name_has_title = frame["Name"].str.contains(title, regex=False, na=False)
        # Only fill rows that are still 0, so the first matching key wins.
        frame.loc[name_has_title & (frame["title_num"] == 0), 'title_num'] = titles_num[title]
In [208]:
# Check whether the helper reports `str` as the type of the "Sex" column.
sex_column_type = get_my_column_type(titanic["Sex"])
sex_column_type == str
Out[208]:
In [224]:
# Encode low-cardinality string columns as integers in a new "<column>_num"
# column on both data frames. 0 is used for values that have no code
# (missing values, or values absent from a predefined mapping).
cols_describe = []
my_dataframes = [titanic, test]

# cat_classif maps column name -> {category value: integer code}.
# Predefined mappings go here; mappings for the remaining columns are
# derived below and stored in the same dict.
cat_classif = {}
cat_classif["title"] = {'Mr.':4, 'Miss.':2,'Mrs.':3,'Master':1}

# One [name, type, distinct non-null value count] entry per training column.
# NOTE(review): get_my_column_type is not defined in the visible cells — it
# must already exist in the kernel.
for i in titanic.columns:
    cols_describe.append([i, get_my_column_type(titanic[i]), len(titanic[i].dropna().unique())])

# Only string-typed columns with fewer than 10 distinct values are encoded.
for categorie in [x[0] for x in cols_describe if x[1] == str and x[2] < 10]:
    cat_name = categorie + "_num"
    if categorie in cat_classif:
        my_classif = cat_classif[categorie]
    else:
        # Collect the distinct values seen in any frame. NaN is dropped first
        # because sorting a mix of float NaN and str raises on Python 3.
        my_categories_list = set()
        for df in my_dataframes:
            my_categories_list = my_categories_list.union(set(df[categorie].dropna().unique()))
        my_categories_list = sorted(my_categories_list)
        # Codes start at 1 and follow sorted order. The mapping is stored
        # under its own column name instead of replacing cat_classif.
        my_classif = {value: code for code, value in enumerate(my_categories_list, start=1)}
        cat_classif[categorie] = my_classif
    for df in my_dataframes:
        # Values without a code map to NaN and are turned into 0.
        df[cat_name] = df[categorie].map(my_classif).fillna(0).astype(int)
In [223]:
# Scratch check: a set built from my_categories_list with the element 'a' added.
set(my_categories_list) | {'a'}
Out[223]:
In [225]:
# Preview the first two rows of the training frame, including derived columns.
titanic.iloc[:2]
Out[225]:
In [ ]: