In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from numpy import corrcoef, sum, log, arange, exp, isnan
from numpy.random import rand
from statsmodels import api as sm
import csv
import numpy as np
import nltk

titanic = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

for Pclass in titanic['Pclass'].unique():
    titanic["isClass"+str(Pclass)] = map(lambda x:(0,1)[x==Pclass],titanic["Pclass"])
    test["isClass"+str(Pclass)] = map(lambda x:(0,1)[x==Pclass],test["Pclass"])
for Embarked in titanic['Embarked'].unique():
    titanic["Embarked"+str(Embarked)] = map(lambda x:(0,1)[x==Embarked],titanic["Embarked"])
    test["Embarked"+str(Embarked)] = map(lambda x:(0,1)[x==Embarked],test["Embarked"])
titanic['SexL'] = titanic['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
titanic["SexL"] = map(lambda x:(0,1)[x=='male'],titanic["Sex"])
test["SexL"] = map(lambda x:(0,1)[x=='male'],test["Sex"])
titanic["isCabine"] = 0
test["isCabine"] = 0
for cabin in titanic[titanic["Cabin"].notnull()].Cabin.unique():
    titanic.loc[((titanic["Cabin"]==cabin)),"isCabine"] = 1
for cabin in test[test["Cabin"].notnull()].Cabin.unique():
    test.loc[((test["Cabin"]==cabin)),"isCabine"] = 1
    #all = regex.findall(cabin)
my_tickets = {}
for passager in titanic.iterrows():
    if passager[1].Ticket in my_tickets:
        my_tickets[passager[1].Ticket] += 1
    else:
        my_tickets[passager[1].Ticket] = 0
for passager in test.iterrows():
    if passager[1].Ticket in my_tickets:
        my_tickets[passager[1].Ticket] += 1
    else:
        my_tickets[passager[1].Ticket] = 0
titanic["ticket_nb"] = 0
test["ticket_nb"] = 0
titanic["relative"] = titanic["SibSp"]+ titanic["Parch"]
test["relative"] = test["SibSp"]+ test["Parch"] 
for tickets in titanic.Ticket.unique():
    titanic.loc[((titanic["Ticket"]==tickets)),"ticket_nb"] = my_tickets[tickets]
for tickets in test.Ticket.unique():
    test.loc[((test["Ticket"]==tickets)),"ticket_nb"] = my_tickets[tickets]    

titanic["relativeA"] = titanic["relative"]+ titanic["ticket_nb"]
test["relativeA"] = test["relative"]+ test["ticket_nb"] 
median_ages = np.zeros((2,3))
median_fares = np.zeros((3))

titanic["AgeC"] = titanic.Age
test["AgeC"] = test.Age
titanic["FareC"] = titanic.Fare
test["FareC"] = test.Fare

for s in titanic["SexL"].unique():
    for c in titanic["Pclass"].unique():
        median_ages[s,c-1] = titanic[(titanic['SexL'] == s) & (titanic['Pclass'] == c)]['Age'].dropna().median()
        titanic.loc[ (titanic.Age.isnull()) & (titanic.SexL == s) & (titanic.Pclass == c), 'AgeC'] = median_ages[s,c-1]
        test.loc[ (test.Age.isnull()) & (test.SexL == s) & (test.Pclass == c), 'AgeC'] = median_ages[s,c-1]
for c in titanic["Pclass"].unique():
    median_fares[c-1] = titanic[(titanic['Pclass'] == c)]['Fare'].dropna().median()
    titanic.loc[ (titanic.Fare.isnull()) & (titanic.Pclass == c), 'FareC'] = median_fares[c-1]
    test.loc[ (test.Fare.isnull()) & (test.Pclass == c), 'FareC'] = median_fares[c-1]       

        
    
titanic["FareClog"] = log(titanic.FareC)
test["FareClog"] = log(test.FareC)    
titanic["AgeD"] = titanic.AgeC
test["AgeD"] = test.AgeC    
stat = {}    
for i in titanic[(titanic.Age.isnull())].iterrows():
    for token in nltk.word_tokenize(i[1]["Name"]):
        if token == "Master":
            titanic.loc[(titanic.PassengerId == i[1]["PassengerId"]), 'AgeD']= 5
for i in test[(test.Age.isnull())].iterrows():
    for token in nltk.word_tokenize(i[1]["Name"]):
        if token == "Master":
            test.loc[(test.PassengerId == i[1]["PassengerId"]), 'AgeD']= 5
titanic['intercept']=1.0
test['intercept']=1.0
titanic["SexM"] = titanic.SexL
test["SexM"] = test.SexL   
test.loc[(test.AgeD < 12), 'SexM'] = 0
titanic.loc[(titanic.AgeD < 12), 'SexM'] = 0  
cols = ['SexL','SexM','isClass1','isClass2','isClass3','AgeC','AgeD','FareC','relative','SibSp','Parch','EmbarkedS','EmbarkedC','EmbarkedQ','Embarkednan','intercept']
ttt = pd.DataFrame(titanic,columns=['Survived'] + cols).dropna()
t= {}
for Pclass in titanic['Pclass'].unique():
    t[Pclass] = pd.DataFrame(titanic[titanic["Pclass"]==Pclass],columns=['Survived','Age','SexL','Fare','intercept']).dropna()
titanic[titanic["Ticket"]=="PC 17757"]


/usr/local/lib/python2.7/site-packages/numexpr/cpuinfo.py:71: UserWarning: [Errno 2] No such file or directory
  stacklevel=stacklevel+1):
Out[1]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare ... isCabine ticket_nb relative relativeA AgeC FareC FareClog AgeD intercept SexM
380 381 1 1 Bidois, Miss. Rosalie female 42 0 0 PC 17757 227.525 ... 0 4 0 4 42 227.525 5.42726 42 1 0
557 558 0 1 Robbins, Mr. Victor male NaN 0 0 PC 17757 227.525 ... 0 4 0 4 40 227.525 5.42726 40 1 1
700 701 1 1 Astor, Mrs. John Jacob (Madeleine Talmadge Force) female 18 1 0 PC 17757 227.525 ... 1 4 1 5 18 227.525 5.42726 18 1 0
716 717 1 1 Endres, Miss. Caroline Louise female 38 0 0 PC 17757 227.525 ... 1 4 0 4 38 227.525 5.42726 38 1 0

4 rows × 30 columns


In [76]:
print(len(test), len(test[cols].dropna()))


(418, 331)

In [71]:
titanic["Age"].dropna().hist()


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x10dba7bd0>

In [83]:
death_counts = pd.crosstab([titanic["Embarked"],titanic["Pclass"], titanic["Sex"]], titanic["Survived"].astype(bool))
death_counts.plot(kind='bar', stacked=True, color=['black','gold'], grid=False)
death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True, color=['black','gold'])


Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x1120098d0>

In [83]:


In [77]:
titanic[(titanic["Cabin"].notnull()) & (titanic["Pclass"]>1)][["Pclass","Cabin","Survived"]]


Out[77]:
Pclass Cabin Survived
10 3 G6 1
21 2 D56 1
66 2 F33 1
75 3 F G73 0
123 2 E101 1
128 3 F E69 1
148 2 F2 0
183 2 F4 1
193 2 F2 1
205 3 G6 0
251 3 G6 0
292 2 D 0
303 2 E101 1
327 2 D 1
340 2 F2 1
345 2 F33 1
394 3 G6 1
429 3 E10 1
473 2 D 1
516 2 F33 1
618 2 F4 1
699 3 F G63 0
715 3 F G73 0
717 2 E101 1
751 3 E121 1
772 2 E77 0
776 3 F38 0
823 3 E121 1

In [7]:
Pclass = 3
data = rand(3,len(t[Pclass]))
data[0,:] = pd.Series(t[Pclass]['Survived'])
data[1,:] = pd.Series(t[Pclass]['Age'])
data[2,:] = pd.Series(t[Pclass]['Fare'])

plt.figure()
R = corrcoef(data)
plt.pcolor(abs(R))
plt.colorbar()
plt.show()



In [29]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3,min_samples_leaf=5)
clf = clf.fit(ttt[['SexL','isClass1','isClass2','intercept']],ttt['Survived'])
export_graphviz(clf, out_file="essai_dot_data_tree",feature_names=['SexL','isClass1','isClass2','intercept'])

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn import cross_validation

for i in range(1,len(cols)):
    forest = RandomForestClassifier(n_estimators = 100)
    scores = cross_validation.cross_val_score(forest, pd[cols], pd['Label'], cv=5)
    forest = forest.fit(ttt[cols[0:i]],ttt['Survived'])
    print(i,forest.score(ttt[cols[0:i]],ttt['Survived']),cols[0:i])
print(forest.feature_importances_)
test.isnull().any(axis=1)#type(export_graphviz(forest.estimators_[0]))
#export_graphviz(forest.estimators_[0], out_file="essai_dot_data",feature_names=['SexL','isClass1','isClass2','intercept'])


(1, 0.78011204481792717, ['SexL'])
(2, 0.78011204481792717, ['SexL', 'isClass1'])
(3, 0.79131652661064422, ['SexL', 'isClass1', 'isClass2'])
(4, 0.79131652661064422, ['SexL', 'isClass1', 'isClass2', 'isClass3'])
(5, 0.89635854341736698, ['SexL', 'isClass1', 'isClass2', 'isClass3', 'Age'])
(6, 0.98459383753501406, ['SexL', 'isClass1', 'isClass2', 'isClass3', 'Age', 'Fare'])
[ 0.26285978  0.03225961  0.02006384  0.05866299  0.32051447  0.30563931]
Out[59]:
0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12    False
13     True
14    False
...
403     True
404    False
405    False
406     True
407    False
408     True
409     True
410     True
411    False
412     True
413     True
414    False
415     True
416     True
417     True
Length: 418, dtype: bool

In [378]:
in_rows_logit = ['SexM', 'relativeA','isClass2','isClass3','EmbarkedQ','EmbarkedS','AgeC']
in_rows_logit.append('intercept')
logit = sm.Logit(titanic['Survived'], titanic[in_rows_logit])
result=logit.fit()


Optimization terminated successfully.
         Current function value: 0.418902
         Iterations 6

In [378]:


In [379]:
def pz(passager,coeff):
    z=0
    for i in coeff.index:
        z += coeff[i]*passager[i]
    return 1/(1+exp(-1*z))

In [380]:
essai = []
for i in ttt.iterrows():
    prev=pz(i[1],coeff)
    essai.append([prev,(prev>.5)==i[1]["Survived"] , prev, (prev>.6), (prev<.4)])
float(reduce(lambda p,c:p+c[1],essai,0))/len(ttt)


Out[380]:
0.8080808080808081

In [380]:


In [381]:
soumission = []
for i in test.iterrows():
    prev=pz(i[1],coeff)
    soumission.append([i[1]["PassengerId"],(prev>.5) , prev, (prev>.6), (prev<.4)])

In [382]:
reduce(lambda p,c : p + (0,1)[c[1]],soumission,0)


Out[382]:
156

In [376]:


In [238]:


In [249]:
import csv
myfile = open('th.csv', 'wb')
wr = csv.writer(myfile,delimiter=',',quoting=csv.QUOTE_NONE, escapechar =" ")
wr.writerow(('PassengerId', 'Survived'))
wr.writerows([(x[0], int(x[1])) for x in soumission])
myfile.close()

In [239]:


In [249]:


In [368]:


In [35]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import cross_validation
import itertools

cols = ['SexM','FareC','isClass1','isClass2','AgeC','relative','relativeA','ticket_nb','Embarkednan','isClass3','AgeD','EmbarkedQ','SibSp','Parch','EmbarkedS','EmbarkedC','intercept']
the_cols = cols[0:-1]
in_rows = []

for count in range(0,len(the_cols)):
    result = []
    for i in the_cols:  
        forest = RandomForestClassifier(n_estimators = 100)
        #forest = ExtraTreesClassifier(bootstrap=False, compute_importances=None,criterion='gini', max_depth=None, max_features='auto',max_leaf_nodes=None, min_density=None, min_samples_leaf=1,min_samples_split=2, n_estimators=100, n_jobs=1,oob_score=False, random_state=None, verbose=0)
        scores = cross_validation.cross_val_score(forest, titanic[in_rows + [i]], titanic['Survived'], cv=3)
        result.append([scores, i])
        #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2),i,len(in_rows))
    #forest = forest.fit(titanic[cols[0:i]], titanic['Survived'])
    #print(sorted(zip(cols[0:i],forest.feature_importances_),key=lambda x:-x[1]))
    result=sorted(result,key=lambda x:-x[0].mean()+2*x[0].std())
    actual = result[0]
    in_rows.append(actual[1])
    del the_cols[the_cols.index(actual[1])]

print(in_rows)
#type(export_graphviz(forest.estimators_[0]))
#export_graphviz(forest.estimators_[0], out_file="essai_dot_data",feature_names=['SexL','isClass1','isClass2','intercept'])


['SexM', 'relativeA', 'isClass3', 'SibSp', 'relative', 'Embarkednan', 'ticket_nb', 'Parch', 'EmbarkedS', 'isClass2', 'EmbarkedQ', 'isClass1', 'EmbarkedC', 'FareC', 'AgeC', 'AgeD']

In [303]:
import csv
n_dim=3
print(in_rows[0:n_dim])

forest = forest.fit(titanic[list(('SexM', 'relative', 'EmbarkedQ', 'Embarkednan', 'isClass2','isClass3'))], titanic['Survived'])
ab_n = zip(test["PassengerId"],forest.predict(test[list(('SexM', 'relative', 'EmbarkedQ', 'Embarkednan', 'isClass2','isClass3'))]))
myfile = open('rf.csv', 'wb')
wr = csv.writer(myfile,delimiter=',',quoting=csv.QUOTE_NONE, escapechar =" ")
wr.writerow(('PassengerId', 'Survived'))
wr.writerows(((x[0], int(x[1])) for x in ab))
myfile.close()


['SexM', 'relative', 'Embarkednan']

In [38]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import cross_validation
import itertools

#cols = ['SexM','FareC','isClass1','isClass2','AgeC','relative','Embarked','isClass3','AgeD','EmbarkedQ','SibSp','Parch','EmbarkedS','EmbarkedC','intercept']
the_cols = in_rows[0:7]+['SexL']
result = []
for i in itertools.combinations(the_cols,5):  
    #forest = RandomForestClassifier(n_estimators = 100)
    forest = ExtraTreesClassifier(bootstrap=False, compute_importances=None,criterion='gini', max_depth=None, max_features='auto',max_leaf_nodes=None, min_density=None, min_samples_leaf=1,min_samples_split=2, n_estimators=100, n_jobs=1,oob_score=False, random_state=None, verbose=0)
    scores = cross_validation.cross_val_score(forest, titanic[list(i)], titanic['Survived'], cv=5)
    result.append([scores, i])
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2),i,len(in_rows))


('Accuracy: 0.82 (+/- 0.03)', ('SexM', 'relativeA', 'isClass3', 'SibSp', 'relative'), 16)
('Accuracy: 0.83 (+/- 0.03)', ('SexM', 'relativeA', 'isClass3', 'SibSp', 'Embarkednan'), 16)
('Accuracy: 0.83 (+/- 0.03)', ('SexM', 'relativeA', 'isClass3', 'SibSp', 'ticket_nb'), 16)
('Accuracy: 0.82 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'SibSp', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'relative', 'Embarkednan'), 16)
('Accuracy: 0.83 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'relative', 'ticket_nb'), 16)
('Accuracy: 0.84 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'relative', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.84 (+/- 0.03)', ('SexM', 'relativeA', 'isClass3', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.02)', ('SexM', 'relativeA', 'isClass3', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.82 (+/- 0.04)', ('SexM', 'relativeA', 'SibSp', 'relative', 'Embarkednan'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'SibSp', 'relative', 'ticket_nb'), 16)
('Accuracy: 0.83 (+/- 0.05)', ('SexM', 'relativeA', 'SibSp', 'relative', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'SibSp', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.83 (+/- 0.06)', ('SexM', 'relativeA', 'SibSp', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.06)', ('SexM', 'relativeA', 'SibSp', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relativeA', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.82 (+/- 0.02)', ('SexM', 'isClass3', 'SibSp', 'relative', 'Embarkednan'), 16)
('Accuracy: 0.83 (+/- 0.03)', ('SexM', 'isClass3', 'SibSp', 'relative', 'ticket_nb'), 16)
('Accuracy: 0.82 (+/- 0.03)', ('SexM', 'isClass3', 'SibSp', 'relative', 'SexL'), 16)
('Accuracy: 0.82 (+/- 0.04)', ('SexM', 'isClass3', 'SibSp', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.82 (+/- 0.05)', ('SexM', 'isClass3', 'SibSp', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.82 (+/- 0.02)', ('SexM', 'isClass3', 'SibSp', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.02)', ('SexM', 'isClass3', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.82 (+/- 0.04)', ('SexM', 'isClass3', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.84 (+/- 0.02)', ('SexM', 'isClass3', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.81 (+/- 0.03)', ('SexM', 'isClass3', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'SibSp', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.81 (+/- 0.05)', ('SexM', 'SibSp', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.05)', ('SexM', 'SibSp', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.82 (+/- 0.06)', ('SexM', 'SibSp', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.83 (+/- 0.04)', ('SexM', 'relative', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.71 (+/- 0.09)', ('relativeA', 'isClass3', 'SibSp', 'relative', 'Embarkednan'), 16)
('Accuracy: 0.71 (+/- 0.09)', ('relativeA', 'isClass3', 'SibSp', 'relative', 'ticket_nb'), 16)
('Accuracy: 0.79 (+/- 0.03)', ('relativeA', 'isClass3', 'SibSp', 'relative', 'SexL'), 16)
('Accuracy: 0.71 (+/- 0.09)', ('relativeA', 'isClass3', 'SibSp', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.79 (+/- 0.03)', ('relativeA', 'isClass3', 'SibSp', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.03)', ('relativeA', 'isClass3', 'SibSp', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.72 (+/- 0.08)', ('relativeA', 'isClass3', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.81 (+/- 0.02)', ('relativeA', 'isClass3', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.81 (+/- 0.02)', ('relativeA', 'isClass3', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.81 (+/- 0.02)', ('relativeA', 'isClass3', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.69 (+/- 0.05)', ('relativeA', 'SibSp', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.80 (+/- 0.05)', ('relativeA', 'SibSp', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.05)', ('relativeA', 'SibSp', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.79 (+/- 0.05)', ('relativeA', 'SibSp', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.06)', ('relativeA', 'relative', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.71 (+/- 0.09)', ('isClass3', 'SibSp', 'relative', 'Embarkednan', 'ticket_nb'), 16)
('Accuracy: 0.79 (+/- 0.02)', ('isClass3', 'SibSp', 'relative', 'Embarkednan', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.02)', ('isClass3', 'SibSp', 'relative', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.02)', ('isClass3', 'SibSp', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.81 (+/- 0.03)', ('isClass3', 'relative', 'Embarkednan', 'ticket_nb', 'SexL'), 16)
('Accuracy: 0.80 (+/- 0.05)', ('SibSp', 'relative', 'Embarkednan', 'ticket_nb', 'SexL'), 16)

In [135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier)
from sklearn import cross_validation
from sklearn import clone
# Parameters
n_classes = 2
n_estimators = 100
RANDOM_SEED = 45  # fix the seed on each iteration


models = [
          DecisionTreeClassifier(max_depth=10),
          #RandomForestClassifier(n_estimators=n_estimators),
          #ExtraTreesClassifier(n_estimators=5),
          #AdaBoostClassifier(DecisionTreeClassifier(max_depth=10),n_estimators=n_estimators)
]

cols=['SexM', 'relativeA', 'isClass3', 'SibSp', 'relative', 'ticket_nb', 'Parch', 'EmbarkedS', 'isClass2', 'EmbarkedQ', 'isClass1', 'EmbarkedC', 'FareC', 'AgeC', 'AgeD']
for i in range(1,len(cols)):
    X_train, X_test,y_train , y_test = cross_validation.train_test_split(titanic[cols[0:i]],titanic['Survived'],test_size=0.15,random_state=12312)
    idx = np.arange(X_train.shape[0])
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(idx)
    X_train = X_train[idx]
    y_train = y_train[idx]
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    X_train = (X_train - mean) / std
    # Train
    clf = clone(model)
    clf = model.fit(X_train, y_train)
    over_fit_score = clf.score(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print(scores,over_fit_score)
import csv
forest = forest.fit(titanic[cols], titanic['Survived'])
ab_e = zip(test["PassengerId"],forest.predict(test[cols]))
#myfile = open('extra.csv', 'wb')
#wr = csv.writer(myfile,delimiter=',',quoting=csv.QUOTE_NONE, escapechar =" ")
#wr.writerow(('PassengerId', 'Survived'))
#wr.writerows(((x[0], int(x[1])) for x in ab_e))
#myfile.close()


(0.58208955223880599, 0.78996036988110963)
(0.58208955223880599, 0.83091149273447817)
(0.58208955223880599, 0.84015852047556139)
(0.58208955223880599, 0.84147952443857332)
(0.61194029850746268, 0.84676354029062084)
(0.59701492537313428, 0.84676354029062084)
(0.59701492537313428, 0.84676354029062084)
(0.59701492537313428, 0.86129458388375169)
(0.59701492537313428, 0.86393659180977544)
(0.59701492537313428, 0.86525759577278727)
(0.58208955223880599, 0.8665785997357992)
(0.58955223880597019, 0.8665785997357992)
(0.66417910447761197, 0.90752972258916775)
(0.56716417910447758, 0.93394980184940557)

In [88]:
my_tickets = {}
for passager in titanic.iterrows():
    if passager[1].Ticket in my_tickets:
        my_tickets[passager[1].Ticket] += 1
    else:
        my_tickets[passager[1].Ticket] = 0

In [111]:
import re
regex = re.compile("([A-I])([0-9])+")
for cabin in titanic[titanic["Cabin"].notnull()].Cabin.unique():
    all = regex.findall(cabin)
    if len(all) > 0:
        print(cabin,all[0][0],all[0][1],ord(all[0][0])-64,int(all[0][1])%2)


('C85', 'C', '5', 3, 1)
('C123', 'C', '3', 3, 1)
('E46', 'E', '6', 5, 0)
('G6', 'G', '6', 7, 0)
('C103', 'C', '3', 3, 1)
('D56', 'D', '6', 4, 0)
('A6', 'A', '6', 1, 0)
('C23 C25 C27', 'C', '3', 3, 1)
('B78', 'B', '8', 2, 0)
('D33', 'D', '3', 4, 1)
('B30', 'B', '0', 2, 0)
('C52', 'C', '2', 3, 0)
('B28', 'B', '8', 2, 0)
('C83', 'C', '3', 3, 1)
('F33', 'F', '3', 6, 1)
('F G73', 'G', '3', 7, 1)
('E31', 'E', '1', 5, 1)
('A5', 'A', '5', 1, 1)
('D10 D12', 'D', '0', 4, 0)
('D26', 'D', '6', 4, 0)
('C110', 'C', '0', 3, 0)
('B58 B60', 'B', '8', 2, 0)
('E101', 'E', '1', 5, 1)
('F E69', 'E', '9', 5, 1)
('D47', 'D', '7', 4, 1)
('B86', 'B', '6', 2, 0)
('F2', 'F', '2', 6, 0)
('C2', 'C', '2', 3, 0)
('E33', 'E', '3', 5, 1)
('B19', 'B', '9', 2, 1)
('A7', 'A', '7', 1, 1)
('C49', 'C', '9', 3, 1)
('F4', 'F', '4', 6, 0)
('A32', 'A', '2', 1, 0)
('B4', 'B', '4', 2, 0)
('B80', 'B', '0', 2, 0)
('A31', 'A', '1', 1, 1)
('D36', 'D', '6', 4, 0)
('D15', 'D', '5', 4, 1)
('C93', 'C', '3', 3, 1)
('C78', 'C', '8', 3, 0)
('D35', 'D', '5', 4, 1)
('C87', 'C', '7', 3, 1)
('B77', 'B', '7', 2, 1)
('E67', 'E', '7', 5, 1)
('B94', 'B', '4', 2, 0)
('C125', 'C', '5', 3, 1)
('C99', 'C', '9', 3, 1)
('C118', 'C', '8', 3, 0)
('D7', 'D', '7', 4, 1)
('A19', 'A', '9', 1, 1)
('B49', 'B', '9', 2, 1)
('C22 C26', 'C', '2', 3, 0)
('C106', 'C', '6', 3, 0)
('C65', 'C', '5', 3, 1)
('E36', 'E', '6', 5, 0)
('C54', 'C', '4', 3, 0)
('B57 B59 B63 B66', 'B', '7', 2, 1)
('C7', 'C', '7', 3, 1)
('E34', 'E', '4', 5, 0)
('C32', 'C', '2', 3, 0)
('B18', 'B', '8', 2, 0)
('C124', 'C', '4', 3, 0)
('C91', 'C', '1', 3, 1)
('E40', 'E', '0', 5, 0)
('C128', 'C', '8', 3, 0)
('D37', 'D', '7', 4, 1)
('B35', 'B', '5', 2, 1)
('E50', 'E', '0', 5, 0)
('C82', 'C', '2', 3, 0)
('B96 B98', 'B', '6', 2, 0)
('E10', 'E', '0', 5, 0)
('E44', 'E', '4', 5, 0)
('A34', 'A', '4', 1, 0)
('C104', 'C', '4', 3, 0)
('C111', 'C', '1', 3, 1)
('C92', 'C', '2', 3, 0)
('E38', 'E', '8', 5, 0)
('D21', 'D', '1', 4, 1)
('E12', 'E', '2', 5, 0)
('E63', 'E', '3', 5, 1)
('A14', 'A', '4', 1, 0)
('B37', 'B', '7', 2, 1)
('C30', 'C', '0', 3, 0)
('D20', 'D', '0', 4, 0)
('B79', 'B', '9', 2, 1)
('E25', 'E', '5', 5, 1)
('D46', 'D', '6', 4, 0)
('B73', 'B', '3', 2, 1)
('C95', 'C', '5', 3, 1)
('B38', 'B', '8', 2, 0)
('B39', 'B', '9', 2, 1)
('B22', 'B', '2', 2, 0)
('C86', 'C', '6', 3, 0)
('C70', 'C', '0', 3, 0)
('A16', 'A', '6', 1, 0)
('C101', 'C', '1', 3, 1)
('C68', 'C', '8', 3, 0)
('A10', 'A', '0', 1, 0)
('E68', 'E', '8', 5, 0)
('B41', 'B', '1', 2, 1)
('A20', 'A', '0', 1, 0)
('D19', 'D', '9', 4, 1)
('D50', 'D', '0', 4, 0)
('D9', 'D', '9', 4, 1)
('A23', 'A', '3', 1, 1)
('B50', 'B', '0', 2, 0)
('A26', 'A', '6', 1, 0)
('D48', 'D', '8', 4, 0)
('E58', 'E', '8', 5, 0)
('C126', 'C', '6', 3, 0)
('B71', 'B', '1', 2, 1)
('B51 B53 B55', 'B', '1', 2, 1)
('D49', 'D', '9', 4, 1)
('B5', 'B', '5', 2, 1)
('B20', 'B', '0', 2, 0)
('F G63', 'G', '3', 7, 1)
('C62 C64', 'C', '2', 3, 0)
('E24', 'E', '4', 5, 0)
('C90', 'C', '0', 3, 0)
('C45', 'C', '5', 3, 1)
('E8', 'E', '8', 5, 0)
('B101', 'B', '1', 2, 1)
('D45', 'D', '5', 4, 1)
('C46', 'C', '6', 3, 0)
('D30', 'D', '0', 4, 0)
('E121', 'E', '1', 5, 1)
('D11', 'D', '1', 4, 1)
('E77', 'E', '7', 5, 1)
('F38', 'F', '8', 6, 0)
('B3', 'B', '3', 2, 1)
('D6', 'D', '6', 4, 0)
('B82 B84', 'B', '2', 2, 0)
('D17', 'D', '7', 4, 1)
('A36', 'A', '6', 1, 0)
('B102', 'B', '2', 2, 0)
('B69', 'B', '9', 2, 1)
('E49', 'E', '9', 5, 1)
('C47', 'C', '7', 3, 1)
('D28', 'D', '8', 4, 0)
('E17', 'E', '7', 5, 1)
('A24', 'A', '4', 1, 0)
('C50', 'C', '0', 3, 0)
('B42', 'B', '2', 2, 0)
('C148', 'C', '8', 3, 0)

In [98]:


In [98]:


In [ ]: