In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from numpy import corrcoef, sum, log, arange, exp, isnan
from numpy.random import rand
from statsmodels import api as sm
import csv
import numpy as np
import nltk
# Load the Kaggle Titanic data sets (train.csv / test.csv next to the notebook).
titanic = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# One-hot encode passenger class into isClass1/isClass2/isClass3.
# NOTE: the original used Python-2 `map(lambda ...)`; under Python 3 `map`
# returns an iterator and the column assignment misbehaves, so use the
# vectorized comparison instead (same 0/1 values).
for Pclass in titanic['Pclass'].unique():
    titanic["isClass" + str(Pclass)] = (titanic["Pclass"] == Pclass).astype(int)
    test["isClass" + str(Pclass)] = (test["Pclass"] == Pclass).astype(int)

# One-hot encode the port of embarkation. NaN is its own category here:
# str(nan) == 'nan' yields an "Embarkednan" column that later cells rely on.
for Embarked in titanic['Embarked'].unique():
    titanic["Embarked" + str(Embarked)] = (titanic["Embarked"] == Embarked).astype(int)
    test["Embarked" + str(Embarked)] = (test["Embarked"] == Embarked).astype(int)

# Binary sex indicator: male -> 1, female -> 0.
# (The original assigned SexL twice with two equivalent encodings; once is enough.)
titanic["SexL"] = (titanic["Sex"] == 'male').astype(int)
test["SexL"] = (test["Sex"] == 'male').astype(int)

# isCabine: 1 when a cabin is recorded, else 0. The original looped over every
# distinct cabin string; notnull() gives the same flag directly.
titanic["isCabine"] = titanic["Cabin"].notnull().astype(int)
test["isCabine"] = test["Cabin"].notnull().astype(int)
# Count, per ticket number, how many *other* passengers (train + test combined)
# share the same ticket: the first occurrence is stored as 0 and each further
# occurrence adds 1, so the value ends up as (group size - 1).
my_tickets = {}
for frame in (titanic, test):
    for ticket in frame["Ticket"]:
        if ticket in my_tickets:
            my_tickets[ticket] += 1
        else:
            my_tickets[ticket] = 0

# ticket_nb = number of ticket-mates; relative = declared family size.
# The original assigned ticket_nb with one .loc per unique ticket (O(n * unique));
# Series.map does the same dictionary lookup in one vectorized pass.
titanic["ticket_nb"] = titanic["Ticket"].map(my_tickets)
test["ticket_nb"] = test["Ticket"].map(my_tickets)

titanic["relative"] = titanic["SibSp"] + titanic["Parch"]
test["relative"] = test["SibSp"] + test["Parch"]

# relativeA: combined "travelling with others" signal (family + ticket-mates).
titanic["relativeA"] = titanic["relative"] + titanic["ticket_nb"]
test["relativeA"] = test["relative"] + test["ticket_nb"]
# Median-impute missing Age per (sex, class) and missing Fare per class.
# Medians are computed on the training set only and applied to both frames.
median_ages = np.zeros((2, 3))
median_fares = np.zeros((3))

# AgeC / FareC are the "completed" copies; the raw columns keep their NaNs.
titanic["AgeC"] = titanic.Age
test["AgeC"] = test.Age
titanic["FareC"] = titanic.Fare
test["FareC"] = test.Fare

for s in titanic["SexL"].unique():
    for c in titanic["Pclass"].unique():
        median_ages[s, c - 1] = titanic[(titanic['SexL'] == s) & (titanic['Pclass'] == c)]['Age'].dropna().median()
        titanic.loc[(titanic.Age.isnull()) & (titanic.SexL == s) & (titanic.Pclass == c), 'AgeC'] = median_ages[s, c - 1]
        test.loc[(test.Age.isnull()) & (test.SexL == s) & (test.Pclass == c), 'AgeC'] = median_ages[s, c - 1]

for c in titanic["Pclass"].unique():
    median_fares[c - 1] = titanic[(titanic['Pclass'] == c)]['Fare'].dropna().median()
    titanic.loc[(titanic.Fare.isnull()) & (titanic.Pclass == c), 'FareC'] = median_fares[c - 1]
    test.loc[(test.Fare.isnull()) & (test.Pclass == c), 'FareC'] = median_fares[c - 1]

# Log-fare feature. NOTE(review): Fare == 0 occurs in this data set, which makes
# log() produce -inf here — confirm downstream cells never use FareClog, or
# switch to log1p if they do.
titanic["FareClog"] = log(titanic.FareC)
test["FareClog"] = log(test.FareC)

# AgeD starts as the imputed age; the next cell refines it for "Master" children.
titanic["AgeD"] = titanic.AgeC
test["AgeD"] = test.AgeC
# Passengers with a missing age whose name carries the title "Master" are young
# boys: give them AgeD = 5 instead of the class/sex median.
# (Vectorized replacement for the original per-row nltk.word_tokenize loop —
# same matches on this data, no punkt download needed, and much faster.
# The unused `stat = {}` was dropped.)
for frame in (titanic, test):
    is_master = frame.Age.isnull() & frame.Name.str.contains(r"\bMaster\b")
    frame.loc[is_master, 'AgeD'] = 5

# Constant term for the statsmodels logit further down.
titanic['intercept'] = 1.0
test['intercept'] = 1.0

# SexM: like SexL, but children under 12 are treated as "not adult male" (0) —
# boys benefited from "women and children first" as well.
titanic["SexM"] = titanic.SexL
test["SexM"] = test.SexL
test.loc[(test.AgeD < 12), 'SexM'] = 0
titanic.loc[(titanic.AgeD < 12), 'SexM'] = 0

# Candidate feature list and the fully-observed training frame used by the
# sklearn cells below.
cols = ['SexL', 'SexM', 'isClass1', 'isClass2', 'isClass3', 'AgeC', 'AgeD', 'FareC', 'relative', 'SibSp', 'Parch', 'EmbarkedS', 'EmbarkedC', 'EmbarkedQ', 'Embarkednan', 'intercept']
ttt = pd.DataFrame(titanic, columns=['Survived'] + cols).dropna()

# Per-class sub-frames on the raw Age/Fare (rows with any NaN dropped).
t = {}
for Pclass in titanic['Pclass'].unique():
    t[Pclass] = pd.DataFrame(titanic[titanic["Pclass"] == Pclass], columns=['Survived', 'Age', 'SexL', 'Fare', 'intercept']).dropna()

# Sanity peek at one shared ticket (last expression -> rich display).
titanic[titanic["Ticket"] == "PC 17757"]
Out[1]:
In [76]:
# How many test rows are fully observed on the model features?
n_total = len(test)
n_complete = len(test[cols].dropna())
print(n_total, n_complete)
In [71]:
# Quick look at the age distribution of training passengers (NaNs dropped).
titanic["Age"].dropna().hist()
Out[71]:
In [83]:
# Survival broken down by embarkation port, class and sex — once as stacked
# absolute counts, once as stacked proportions per group.
death_counts = pd.crosstab(
    [titanic["Embarked"], titanic["Pclass"], titanic["Sex"]],
    titanic["Survived"].astype(bool),
)
death_counts.plot(kind='bar', stacked=True, color=['black', 'gold'], grid=False)
proportions = death_counts.div(death_counts.sum(1).astype(float), axis=0)
proportions.plot(kind='barh', stacked=True, color=['black', 'gold'])
Out[83]:
In [83]:
In [77]:
# Second/third-class passengers that nevertheless have a cabin recorded.
titanic[(titanic["Cabin"].notnull()) & (titanic["Pclass"]>1)][["Pclass","Cabin","Survived"]]
Out[77]:
In [7]:
# Correlation heat map between survival, age and fare within one class.
Pclass = 3
# The original pre-filled `data` with rand() and then overwrote every row;
# stacking the three series directly is equivalent and clearer.
data = np.vstack([t[Pclass]['Survived'], t[Pclass]['Age'], t[Pclass]['Fare']])
plt.figure()
R = corrcoef(data)
plt.pcolor(abs(R))  # plot |r| so negative correlations are visible too
plt.colorbar()
plt.show()
In [29]:
from sklearn import tree
# BUG FIX: export_graphviz was only imported in a later cell, so this cell
# raised NameError on a fresh Restart-&-Run-All.
from sklearn.tree import export_graphviz

# Small, readable decision tree on the strongest features; dumped as graphviz.
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf = clf.fit(ttt[['SexL', 'isClass1', 'isClass2', 'intercept']], ttt['Survived'])
export_graphviz(clf, out_file="essai_dot_data_tree", feature_names=['SexL', 'isClass1', 'isClass2', 'intercept'])
In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

# Grow the feature list one column at a time; compare train accuracy with a
# 5-fold CV estimate to spot over-fitting.
# BUG FIX: the original called cross_val_score(forest, pd[cols], pd['Label']),
# indexing the pandas *module* instead of the training frame — a guaranteed
# crash; score on the fully-observed frame `ttt` instead.
for i in range(1, len(cols)):
    forest = RandomForestClassifier(n_estimators=100)
    scores = cross_val_score(forest, ttt[cols[0:i]], ttt['Survived'], cv=5)
    forest = forest.fit(ttt[cols[0:i]], ttt['Survived'])
    print(i, forest.score(ttt[cols[0:i]], ttt['Survived']), scores.mean(), cols[0:i])
    print(forest.feature_importances_)
# Which test rows still contain NaNs on any column?
test.isnull().any(axis=1)
Out[59]:
In [378]:
# Logistic regression (statsmodels) on the hand-picked feature set,
# with an explicit intercept column.
in_rows_logit = ['SexM', 'relativeA', 'isClass2', 'isClass3', 'EmbarkedQ', 'EmbarkedS', 'AgeC', 'intercept']
logit = sm.Logit(titanic['Survived'], titanic[in_rows_logit])
result = logit.fit()
In [378]:
In [379]:
def pz(passager, coeff):
    """Logistic response for one passenger.

    Parameters
    ----------
    passager : mapping or pandas Series of feature values, indexable by the
        names appearing in ``coeff.index``.
    coeff : pandas Series of fitted logit coefficients.

    Returns
    -------
    float in (0, 1): the sigmoid 1 / (1 + exp(-x . beta)).
    """
    z = sum(coeff[name] * passager[name] for name in coeff.index)
    return 1 / (1 + exp(-1 * z))
In [380]:
# Evaluate the fitted logit on the training frame.
# BUG FIX: the original relied on `coeff` leaked from a deleted cell; bind it
# explicitly to the fitted parameters of the logit in `result`.
coeff = result.params
essai = []
for i in ttt.iterrows():
    prev = pz(i[1], coeff)
    # [probability, correct?, probability, confident-yes, confident-no]
    essai.append([prev, (prev > .5) == i[1]["Survived"], prev, (prev > .6), (prev < .4)])
# Training accuracy. Python 3: `reduce` is no longer a builtin; sum() is clearer.
float(sum(c[1] for c in essai)) / len(ttt)
Out[380]:
In [380]:
In [381]:
# Predict on the test set with the fitted logit.
# BUG FIX: bind `coeff` explicitly (the original relied on a name leaked from
# a deleted cell).
coeff = result.params
soumission = []
for i in test.iterrows():
    prev = pz(i[1], coeff)
    # [PassengerId, predicted-survived, probability, confident-yes, confident-no]
    soumission.append([i[1]["PassengerId"], (prev > .5), prev, (prev > .6), (prev < .4)])
In [382]:
# Number of predicted survivors. Python 3: `reduce` is no longer a builtin;
# sum() expresses the same count directly.
sum(int(c[1]) for c in soumission)
Out[382]:
In [376]:
In [238]:
In [249]:
import csv

# Write the logit submission.
# BUG FIX: Python 3's csv module needs a text-mode handle with newline=''
# (the original 'wb' mode raises TypeError); a context manager guarantees the
# file is closed even on error.
with open('th.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=" ")
    wr.writerow(('PassengerId', 'Survived'))
    wr.writerows([(x[0], int(x[1])) for x in soumission])
In [239]:
In [249]:
In [368]:
In [35]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier  # was only imported in another cell
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
import itertools

# Greedy forward feature selection: each round adds the candidate column whose
# 3-fold CV score (mean penalised by twice the spread) is best.
cols = ['SexM', 'FareC', 'isClass1', 'isClass2', 'AgeC', 'relative', 'relativeA', 'ticket_nb', 'Embarkednan', 'isClass3', 'AgeD', 'EmbarkedQ', 'SibSp', 'Parch', 'EmbarkedS', 'EmbarkedC', 'intercept']
the_cols = cols[0:-1]
in_rows = []
for count in range(0, len(the_cols)):
    # NOTE: renamed from `result` — the original clobbered the statsmodels
    # logit result that other cells still read.
    round_scores = []
    for candidate in the_cols:
        forest = RandomForestClassifier(n_estimators=100)
        scores = cross_val_score(forest, titanic[in_rows + [candidate]], titanic['Survived'], cv=3)
        round_scores.append([scores, candidate])
    # Best first: highest mean, penalised by 2 * std (pessimistic estimate).
    round_scores = sorted(round_scores, key=lambda x: -x[0].mean() + 2 * x[0].std())
    actual = round_scores[0]
    in_rows.append(actual[1])
    del the_cols[the_cols.index(actual[1])]
    print(in_rows)
In [303]:
import csv

# Fit the forest on the hand-chosen feature subset and write the submission.
rf_features = ['SexM', 'relative', 'EmbarkedQ', 'Embarkednan', 'isClass2', 'isClass3']
n_dim = 3
print(in_rows[0:n_dim])
forest = forest.fit(titanic[rf_features], titanic['Survived'])
# Materialise the pairs (Python 3 zip is a one-shot iterator).
ab_n = list(zip(test["PassengerId"], forest.predict(test[rf_features])))
# BUG FIX: the original wrote `for x in ab` (undefined name) and opened the
# file in 'wb' mode, which breaks csv under Python 3.
with open('rf.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=" ")
    wr.writerow(('PassengerId', 'Survived'))
    wr.writerows((x[0], int(x[1])) for x in ab_n)
In [38]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
import itertools

# Exhaustively score every 5-column combination of the best forward-selected
# features (plus raw SexL) with 5-fold CV.
the_cols = in_rows[0:7] + ['SexL']
result = []
for combo in itertools.combinations(the_cols, 5):
    # NOTE: `compute_importances` and `min_density` were removed from sklearn
    # long ago, and max_features='auto' is gone too ('sqrt' is the classifier
    # equivalent); the remaining arguments reproduce the original configuration.
    forest = ExtraTreesClassifier(bootstrap=False, criterion='gini', max_depth=None,
                                  max_features='sqrt', max_leaf_nodes=None,
                                  min_samples_leaf=1, min_samples_split=2,
                                  n_estimators=100, n_jobs=1, oob_score=False,
                                  random_state=None, verbose=0)
    scores = cross_val_score(forest, titanic[list(combo)], titanic['Survived'], cv=5)
    result.append([scores, combo])
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2), combo, len(in_rows))
In [135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from sklearn.base import clone  # BUG FIX: `from sklearn import clone` is not a valid import path

# Parameters
n_classes = 2
n_estimators = 100
RANDOM_SEED = 45  # fix the seed on each iteration

models = [
    DecisionTreeClassifier(max_depth=10),
    # RandomForestClassifier(n_estimators=n_estimators),
    # ExtraTreesClassifier(n_estimators=5),
    # AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=n_estimators),
]

cols = ['SexM', 'relativeA', 'isClass3', 'SibSp', 'relative', 'ticket_nb', 'Parch', 'EmbarkedS', 'isClass2', 'EmbarkedQ', 'isClass1', 'EmbarkedC', 'FareC', 'AgeC', 'AgeD']

# Compare held-out accuracy with train accuracy while growing the feature list,
# to see where over-fitting sets in.
for i in range(1, len(cols)):
    X_train, X_test, y_train, y_test = train_test_split(titanic[cols[0:i]], titanic['Survived'], test_size=0.15, random_state=12312)
    idx = np.arange(X_train.shape[0])
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(idx)
    # BUG FIX: X_train[idx] on a DataFrame selects *columns* by label; use
    # positional indexing to shuffle the rows (same for the label Series).
    X_train = X_train.iloc[idx]
    y_train = y_train.iloc[idx]
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    X_train = (X_train - mean) / std
    # BUG FIX: the held-out fold must be standardised with the *training*
    # statistics — the original scored the model on raw, unscaled X_test.
    X_test = (X_test - mean) / std
    # Train. BUG FIX: the original called clone() on an unbound name `model`
    # and then fitted `model` instead of the clone; use the configured model.
    clf = clone(models[0])
    clf = clf.fit(X_train, y_train)
    over_fit_score = clf.score(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print(scores, over_fit_score)

# Refit the forest from the earlier cells on the full feature list and predict.
# (The commented-out extra.csv writer that followed was dead code and was removed.)
forest = forest.fit(titanic[cols], titanic['Survived'])
ab_e = list(zip(test["PassengerId"], forest.predict(test[cols])))
In [88]:
# Re-count ticket multiplicities on the training set alone:
# first occurrence -> 0, each repeat adds 1 (so value = group size - 1).
my_tickets = {}
for passager in titanic.iterrows():
    ticket = passager[1].Ticket
    if ticket in my_tickets:
        my_tickets[ticket] += 1
    else:
        my_tickets[ticket] = 0
In [111]:
import re

# Parse recorded cabins into a deck letter and a cabin number.
# BUG FIX: the original pattern ([A-I])([0-9])+ put the + outside the group,
# so it captured only the *last* digit of the number; ([0-9]+) captures the
# whole number as intended. (Parity of the last digit equals parity of the
# number, so the %2 column is unchanged.)
regex = re.compile("([A-I])([0-9]+)")
for cabin in titanic[titanic["Cabin"].notnull()].Cabin.unique():
    # `found` instead of `all`, which shadowed the builtin.
    found = regex.findall(cabin)
    if len(found) > 0:
        # deck index (A=1, B=2, ...) and cabin-number parity (ship side)
        print(cabin, found[0][0], found[0][1], ord(found[0][0]) - 64, int(found[0][1]) % 2)
In [98]:
In [98]:
In [ ]: