Data Organization (Data Wrangling)


In [37]:
# Imports for pandas, and numpy

import numpy as np
import pandas as pd

# imports for seaborn to and matplotlib to allow graphing
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

# import Titanic CSV  -  NOTE: adjust file path as neccessary
dTitTrain_DF = pd.read_csv('train.csv')

# Clearing of Columns not neccesary for statistical analysis
dTitTrain_DF = dTitTrain_DF.drop(["Name", "Ticket"], axis=1)

In [38]:
dTitTrain_DF.info()
dTitTrain_DF.describe()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 76.6+ KB
Out[38]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [39]:
titAge  = dTitTrain_DF.dropna(subset=['Age'])

In [40]:
# Distribution gender (adult and male)
ACmenData = dTitTrain_DF[dTitTrain_DF.Sex == 'male']
ACwomenData = dTitTrain_DF[dTitTrain_DF.Sex == 'female']

ACmenDataCount = float(ACmenData['Sex'].count())
ACwomenDataCount = float(ACwomenData['Sex'].count())

# Gender Specific DFs
AmenData = dTitTrain_DF[dTitTrain_DF.Sex == 'male'][dTitTrain_DF.Age >= 21]
AwomenData = dTitTrain_DF[dTitTrain_DF.Sex == 'female'][dTitTrain_DF.Age >= 21]

AmenDataCount = float(AmenData['Sex'].count())
AwomenDataCount = float(AwomenData['Sex'].count())

# print(menDataCount)
# print(womenDataCount)

In [41]:
# Age Specific Groups
adultData = titAge[titAge.Age >= 21]
childData = titAge[titAge.Age < 21]

adultDataCount = float(adultData['Age'].count())
childDataCount = float(childData['Age'].count())

#print(childDataCount)
#print(adultDataCount)

In [42]:
# Pclass

titClass1 = dTitTrain_DF[dTitTrain_DF.Pclass == 1]
titClass2 = dTitTrain_DF[dTitTrain_DF.Pclass == 2]
titClass3 = dTitTrain_DF[dTitTrain_DF.Pclass == 3]

In [43]:
# Alone or Family

dTitTrain_DF['SoloOrFamily'] = dTitTrain_DF.SibSp + dTitTrain_DF.Parch
dTitTrain_DF['SoloOrFamily'].loc[dTitTrain_DF['SoloOrFamily'] > 0] = 'Family'
dTitTrain_DF['SoloOrFamily'].loc[dTitTrain_DF['SoloOrFamily'] == 0] = 'Alone'

In [44]:
# Survivor Column (Yes or no)
dTitTrain_DF['Survivor']= dTitTrain_DF.Survived.map({0:'No', 1:'Yes'})

In [45]:
titCabin  = dTitTrain_DF.dropna(subset=['Cabin'])

In [46]:
# Locational Data Groups
titDecks = titCabin['Cabin']


def deckGrab(tDK, cabLetter):
    deckLevels = []

    for level in tDK:
        deckLevels.append(level[0])
    
    TDF = pd.DataFrame(deckLevels)
    TDF.columns = ['Cabin']

    TDF = TDF[TDF.Cabin == cabLetter]
    
    return TDF


def deckCount(tDK, cabLetter):
    TDF = deckGrab(tDK, cabLetter)
    return TDF[TDF.Cabin == cabLetter].count()['Cabin']
    

# print(deckCount(titDecks, "A"))
# print(deckCount(titDecks, "B"))
# print(deckCount(titDecks, "C"))
# print(deckCount(titDecks, "D"))
# print(deckCount(titDecks, "E"))
# print(deckCount(titDecks, "F"))
# print(deckCount(titDecks, "G"))

In [47]:
# embarked

titCherbourg = dTitTrain_DF[dTitTrain_DF.Embarked == 'C']
titQueenstown = dTitTrain_DF[dTitTrain_DF.Embarked == 'Q']
titSouthampton = dTitTrain_DF[dTitTrain_DF.Embarked == 'S']

Distribution of Passengers

Gender - Analysis | Graph

Distribution of Genders in Passenger Population


In [48]:
printG  =  "Men account for " + str(ACmenDataCount) +  " and "  + "Women account for " + str(ACwomenDataCount) + " (Total Passengers: " + str(dTitTrain_DF.count()['Age']) + ")"
print(printG)


Men account for 577.0 and Women account for 314.0 (Total Passengers: 714)

In [49]:
gSSC = sns.factorplot('Sex', data=dTitTrain_DF, kind='count')
gSSC.despine(left=True)
gSSC.set_ylabels("count of passengers")


Out[49]:
<seaborn.axisgrid.FacetGrid at 0xca10588>

Distribution of Genders in pClass populations


In [50]:
gGCSC= sns.factorplot('Pclass',order=[1,2,3], data=dTitTrain_DF, hue='Sex', kind='count')

gGCSC.despine(left=True)
gGCSC.set_ylabels("count of passengers")


Out[50]:
<seaborn.axisgrid.FacetGrid at 0x3cf1c88>

Age - Analysis | Graph


In [51]:
printA = "Youngest Passenger in the passenger list was " + str(titAge['Age'].min()) + " years of age." \
    + "\n" + "Oldest Passenger in the passenger list was " + str(titAge['Age'].max()) + " years of age." \
    + "\n" + "Mean of Passengers ages in the passenger list is " + str(titAge['Age'].mean()) + " years of age."

print(printA)


Youngest Passenger in the passenger list was 0.42 years of age.
Oldest Passenger in the passenger list was 80.0 years of age.
Mean of Passengers ages in the passenger list is 29.6991176471 years of age.

Distrbution of Age in passenger population


In [52]:
titAge['Age'].hist(bins=80)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0xeb9ccc0>

Distribution of Age in pClass population


In [53]:
gCPS = sns.FacetGrid(titAge,hue='Pclass', aspect=4, hue_order=[1,2,3])
gCPS.map(sns.kdeplot,'Age', shade=True)

gCPS.set(xlim=(0,titAge['Age'].max()))
gCPS.add_legend()


Out[53]:
<seaborn.axisgrid.FacetGrid at 0xeba6390>

Distribution of passengers into adult and children age groups (Child = less than 21 years of age)

Reference:

Source: http://history.stackexchange.com/questions/17481/what-was-the-age-of-majority-in-1900-united-states

By the common law the age of majority is fixed at twenty-one years for both sexes, and, in the absence of any statute to >the contrary, every person under that age, whether male or female, is an infant. (21)

-- The American and English Encyclopedia of Law, Garland and McGeehee, 1900

By the common law, every person is, technically, an infant, until he is twenty-one years old; and, in legal presumption, is >not of sufficient discretion to contract an obligation at an earlier age.

-- Institutes of the Lawes of England by Coke (1628-1644). The laws on infants are at 171b.


In [54]:
# splits passengers into 3 categories (male of female if considered adult, and child if below 21 of age)

def minorOrAdult(passenger):
    age, sex = passenger
    
    if age < 21:
        return 'child'
    else:
        return sex

# adds new column to dataframe that distinguishes a passenger as a child or an adult

dTitTrain_DF['PersonStatus'] = dTitTrain_DF[['Age', 'Sex']].apply(minorOrAdult, axis=1)

In [55]:
dTitTrain_DF['PersonStatus'].value_counts()


Out[55]:
male      474
female    237
child     180
Name: PersonStatus, dtype: int64

Distribution of child and adult (male and female) age groups by age


In [56]:
gACPS = sns.FacetGrid(dTitTrain_DF, hue='PersonStatus', aspect=4, hue_order=['child', 'male', 'female'])
gACPS.map(sns.kdeplot,'Age', shade=True)

gACPS.set(xlim=(0,titAge['Age'].max()))
gACPS.add_legend()


Out[56]:
<seaborn.axisgrid.FacetGrid at 0xef226d8>

Distribution of child and adult (male and female) by pClass


In [57]:
gGAC= sns.factorplot('Pclass', order=[1,2,3], data=dTitTrain_DF, hue='PersonStatus', kind='count',hue_order=['child','male','female'])

gGAC.despine(left=True)
gGAC.set_ylabels("count of passengers")


Out[57]:
<seaborn.axisgrid.FacetGrid at 0xeed55f8>

Alone or Family


In [58]:
sns.factorplot('SoloOrFamily', data=dTitTrain_DF, kind='count')

print("Alone: " + str(dTitTrain_DF[dTitTrain_DF.SoloOrFamily == "Alone"].count()['SoloOrFamily']))
print("Family: " + str(dTitTrain_DF[dTitTrain_DF.SoloOrFamily == "Family"].count()['SoloOrFamily']))


Alone: 537
Family: 354

Locational: Cabin Analysis | Graph


In [59]:
def prepareDeckGraph(titDecksDF):
    deckLevels = []

    for level in titDecksDF:
        deckLevels.append(level[0])

    T_DF = pd.DataFrame(deckLevels)

    T_DF.columns = ['Cabin']

    T_DF = T_DF[T_DF.Cabin != 'T']
    
    return T_DF
    

gTD_DF = prepareDeckGraph(titDecks) 
sns.factorplot('Cabin', order=['A','B','C','D','E','F','G'], data=gTD_DF, kind='count')


print("A: " + str(deckCount(titDecks, "A")))
print("B: " + str(deckCount(titDecks, "B")))
print("C: " + str(deckCount(titDecks, "C")))
print("D: " + str(deckCount(titDecks, "D")))
print("E: " + str(deckCount(titDecks, "E")))
print("F: " + str(deckCount(titDecks, "F")))
print("G: " + str(deckCount(titDecks, "G")))


A: 15
B: 47
C: 59
D: 33
E: 32
F: 13
G: 4

Locational: Disembark Analysis | Graph


In [60]:
sns.factorplot('Embarked', order=['C','Q','S'], data=dTitTrain_DF, hue='Pclass', kind='count', hue_order=[1,2,3])

# titCherbourg 
# titQueenstown 
# titSouthampton 

print("Total:")
print("Cherbourg: " + str(titCherbourg.count()['Embarked']))
print("Queenstown: " + str(titQueenstown.count()['Embarked']))
print("Southampton: " + str(titSouthampton.count()['Embarked']))

print("")

print("Cherbourg: ")
print("Pclass 1 - " + str(titCherbourg[titCherbourg.Pclass == 1].count()['Embarked'])) 
print("Pclass 2 - " + str(titCherbourg[titCherbourg.Pclass == 2].count()['Embarked']))
print("Pclass 3 - " + str(titCherbourg[titCherbourg.Pclass == 3].count()['Embarked']))

print("")

print("Queenstown: ")
print("Pclass 1 - " + str(titQueenstown[titQueenstown.Pclass == 1].count()['Embarked'])) 
print("Pclass 2 - " + str(titQueenstown[titQueenstown.Pclass == 2].count()['Embarked']))
print("Pclass 3 - " + str(titQueenstown[titQueenstown.Pclass == 3].count()['Embarked']))

print("")

print("Southampton: ")
print("Pclass 1 - " + str(titSouthampton[titSouthampton.Pclass == 1].count()['Embarked'])) 
print("Pclass 2 - " + str(titSouthampton[titSouthampton.Pclass == 2].count()['Embarked']))
print("Pclass 3 - " + str(titSouthampton[titSouthampton.Pclass == 3].count()['Embarked']))


Total:
Cherbourg: 168
Queenstown: 77
Southampton: 644

Cherbourg: 
Pclass 1 - 85
Pclass 2 - 17
Pclass 3 - 66

Queenstown: 
Pclass 1 - 2
Pclass 2 - 3
Pclass 3 - 72

Southampton: 
Pclass 1 - 127
Pclass 2 - 164
Pclass 3 - 353

Surivival Graph Comparison

Survival Count (Overall)


In [61]:
# Survivors Overall


gSOA = sns.factorplot('Survivor', data=dTitTrain_DF, kind='count')

gSOA.despine(left=True)
gSOA.set_ylabels("count of passengers")

print("Survivor: " + str(dTitTrain_DF[dTitTrain_DF.Survivor == "Yes"].count()['Survivor']))
print("Non-Survivor: " + str(dTitTrain_DF[dTitTrain_DF.Survivor == "No"].count()['Survivor']))


Survivor: 342
Non-Survivor: 549

Survival by Gender


In [62]:
# Series probability - access probability of survived in men and women
menProb = ACmenData.groupby('Sex').Survived.mean()
womenProb = ACwomenData.groupby('Sex').Survived.mean()

menPercent = menProb[0]*100
womenPercent = womenProb[0]*100

print("Men Survivalbility: ")
print(menProb[0])

print("Women Survivalbility: ")
print(womenProb[0])


gSSP = sns.factorplot("Sex", "Survived", data=dTitTrain_DF, kind="bar", size=5)

gSSP.despine(left=True)
gSSP.set_ylabels("survival probability")


Men Survivalbility: 
0.188908145581
Women Survivalbility: 
0.742038216561
Out[62]:
<seaborn.axisgrid.FacetGrid at 0x1091def0>

Survival by Pclass


In [63]:
# Determines the probability of survival for a given Pclass

def define_pClassProb(dataFrameIN, numClass):
    classEntries = dataFrameIN[dataFrameIN.Pclass == numClass]
    sClassEntries = classEntries[classEntries.Survived == 1]

    cClassEntries = (classEntries.count(numeric_only=True)['Pclass']).astype(float)
    cSClassEntries = (sClassEntries.count(numeric_only=True)['Pclass']).astype(float)

    return (cSClassEntries/cClassEntries)
    

print("Class 1 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 1))

print("Class 2 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 2))

print("Class 3 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 3))


gCS = sns.factorplot("Pclass", "Survived",order=[1,2,3],data=dTitTrain_DF, kind="bar", size=5)

gCS.despine(left=True)
gCS.set_ylabels("survival probability")


Class 1 Survivality: 
0.62962962963
Class 2 Survivality: 
0.472826086957
Class 3 Survivality: 
0.242362525458
Out[63]:
<seaborn.axisgrid.FacetGrid at 0x1052d7f0>

In [64]:
print("Class 1 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 1))

print("Class 2 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 2))

print("Class 3 Survivality: ")
print(define_pClassProb(dTitTrain_DF, 3))

sns.factorplot("Pclass", "Survived",order=[1,2,3], data=dTitTrain_DF, kind='point')


Class 1 Survivality: 
0.62962962963
Class 2 Survivality: 
0.472826086957
Class 3 Survivality: 
0.242362525458
Out[64]:
<seaborn.axisgrid.FacetGrid at 0xc665080>

Survival Pclass and Gender


In [65]:
# determines the probability of survival for genders in a given Pclass

def define_pClassProbSex(dataFrameIN, numClass, sex):
    classEntries = dataFrameIN[dataFrameIN.Pclass == numClass][dataFrameIN.Sex == sex]
    sClassEntries = classEntries[classEntries.Survived == 1]

    cClassEntries = (classEntries.count(numeric_only=True)['Pclass']).astype(float)
    cSClassEntries = (sClassEntries.count(numeric_only=True)['Pclass']).astype(float)

    return (cSClassEntries/cClassEntries)
    

print("Class 1 Survivality(MALE): ")
print(define_pClassProbSex(dTitTrain_DF, 1, 'male'))

print("Class 1 Survivality(FEMALE): ")
print(define_pClassProbSex(dTitTrain_DF, 1, 'female'))


print("Class 2 Survivality(MALE): ")
print(define_pClassProbSex(dTitTrain_DF, 2, 'male'))

print("Class 2 Survivality(FEMALE): ")
print(define_pClassProbSex(dTitTrain_DF, 2, 'female'))

print("Class 3 Survivality(MALE): ")
print(define_pClassProbSex(dTitTrain_DF, 3, 'male'))

print("Class 3 Survivality(FEMALE): ")
print(define_pClassProbSex(dTitTrain_DF, 3, 'female'))


gGCSP = sns.factorplot("Pclass", "Survived",order=[1,2,3],data=dTitTrain_DF,hue='Sex', kind='bar')

gGCSP.despine(left=True)
gGCSP.set_ylabels("survival probability")


Class 1 Survivality(MALE): 
0.368852459016
Class 1 Survivality(FEMALE): 
0.968085106383
Class 2 Survivality(MALE): 
0.157407407407
Class 2 Survivality(FEMALE): 
0.921052631579
Class 3 Survivality(MALE): 
0.135446685879
Class 3 Survivality(FEMALE): 
0.5
Out[65]:
<seaborn.axisgrid.FacetGrid at 0x1134b2e8>

In [66]:
sns.factorplot("Pclass", "Survived", hue='Sex',order=[1,2,3], data=dTitTrain_DF, kind='point')


Out[66]:
<seaborn.axisgrid.FacetGrid at 0x11733278>

Survival By Pclass and Age Group (Adult (Male / Female) / Child)


In [90]:
#Determine probability of survival of children in a given Pclass

def define_pClassChildProb(dataFrameIN, numClass):
    
    ChildDF = dataFrameIN[dataFrameIN.Pclass == numClass][dataFrameIN.PersonStatus == 'child']
    ChildSurvived = dataFrameIN[dataFrameIN.Pclass == numClass][dataFrameIN.PersonStatus == 'child'][dataFrameIN.Survivor == 'Yes']
    
    totalCChild = ChildDF.count()['PassengerId'].astype(float)
    CChildSurvived = ChildSurvived.count()['PassengerId'].astype(float)
    

    return CChildSurvived/totalCChild

def define_pClassAdultProb(dataFrameIN, numClass, sex):
    
    AdultDF = dataFrameIN[dataFrameIN.Pclass == numClass][dataFrameIN.PersonStatus == sex]
    AdultSurvived = dataFrameIN[dataFrameIN.Pclass == numClass][dataFrameIN.PersonStatus == sex][dataFrameIN.Survivor == 'Yes']
    
    totalCAdult = AdultDF.count()['PassengerId'].astype(float)
    CAdultSurvived = AdultSurvived.count()['PassengerId'].astype(float)
    
    return CAdultSurvived/totalCAdult
    
print("PClass 1 Survival Child: ")    
print(define_pClassChildProb(dTitTrain_DF, 1))

print("PClass 1 Survival Female: ")    
print(define_pClassAdultProb(dTitTrain_DF, 1, 'female'))

print("PClass 1 Survival Male: ")    
print(define_pClassAdultProb(dTitTrain_DF, 1, 'male'))

print("-----------")

print("PClass 2 Survival Child: ")    
print(define_pClassChildProb(dTitTrain_DF, 2))

print("PClass 2 Survival Female: ")    
print(define_pClassAdultProb(dTitTrain_DF, 2, 'female'))

print("PClass 2 Survival Male: ")    
print(define_pClassAdultProb(dTitTrain_DF, 2, 'male'))

print("-----------")

print("PClass 3 Survival Child: ")    
print(define_pClassChildProb(dTitTrain_DF, 3))

print("PClass 3 Survival Female: ")    
print(define_pClassAdultProb(dTitTrain_DF, 3, 'female'))

print("PClass 3 Survival Male: ")    
print(define_pClassAdultProb(dTitTrain_DF, 3, 'male'))


sns.factorplot("Pclass", "Survived", hue='PersonStatus',order=[1,2,3], data=dTitTrain_DF, kind='point')


PClass 1 Survival Child: 
0.809523809524
PClass 1 Survival Female: 
0.975
PClass 1 Survival Male: 
0.35652173913
-----------
PClass 2 Survival Child: 
0.742857142857
PClass 2 Survival Female: 
0.9
PClass 2 Survival Male: 
0.0786516853933
-----------
PClass 3 Survival Child: 
0.314516129032
PClass 3 Survival Female: 
0.494845360825
PClass 3 Survival Male: 
0.118518518519
Out[90]:
<seaborn.axisgrid.FacetGrid at 0x10949b70>

Survival by Age Distribution


In [68]:
#sns.lmplot('Age', 'Survived', data=dTitTrain_DF)
pSBA = sns.boxplot(data=dTitTrain_DF, x='Survived', y='Age')
pSBA.set(title='Age Distribution by Survival',
            xlabel = 'Survival',
            ylabel = 'Age Distrobution',
            xticklabels = ['Died', 'Survived'])


Out[68]:
[<matplotlib.text.Text at 0x12028cf8>,
 [<matplotlib.text.Text at 0x11dfc358>, <matplotlib.text.Text at 0x120bb3c8>],
 <matplotlib.text.Text at 0x1201bd30>,
 <matplotlib.text.Text at 0x120a8cf8>]

Survival by Alone or with Family


In [69]:
# Using Solo or family column created earlier in passenger distributions section created a separate dataframes for traveling 
#alone and with family passengers


familyPass = dTitTrain_DF[dTitTrain_DF['SoloOrFamily'] == "Family"]
alonePass = dTitTrain_DF[dTitTrain_DF['SoloOrFamily'] == "Alone"]

# Creates a list of surviving family and alone passengers

AFamilyPass = familyPass[familyPass.Survivor == "Yes"]
AAlonePass = alonePass[alonePass.Survivor == "Yes"]

# Determines the probability of survival for passengers that traveled alone and with family

pAF = float(AFamilyPass['SoloOrFamily'].count()) / float(familyPass['SoloOrFamily'].count())
pAA = float(AAlonePass['SoloOrFamily'].count()) / float(alonePass['SoloOrFamily'].count())

print("Probability of Survival being with Family: ")
print(pAF)

print("")

print("Probability of Survival being alone: ")
print(pAA)

gSSP = sns.factorplot("SoloOrFamily", "Survived", data=dTitTrain_DF, kind="bar", size=5)

gSSP.despine(left=True)
gSSP.set_ylabels("survival probability")


Probability of Survival being with Family: 
0.505649717514

Probability of Survival being alone: 
0.303538175047
Out[69]:
<seaborn.axisgrid.FacetGrid at 0xdafb9b0>

Survival pClass by Age Distribution


In [70]:
#sns.lmplot('Age', 'Survived',hue='Pclass', data=dTitanic_DF, hue_order=[1,2,3])
pACSB = sns.boxplot(data = dTitTrain_DF.dropna(subset = ['Age']).sort_values('Pclass'), x='Pclass', y='Age', hue='Survivor')
pACSB.set(title='Age by Class and Survival - Box Plot', xlabel='Pclass')
pACSB.legend(bbox_to_anchor=(1.05, .7), loc=2, title = 'Survived',borderaxespad=0.)


Out[70]:
<matplotlib.legend.Legend at 0x11d27978>

Survival Gender by Age Distribution


In [71]:
#sns.lmplot('Age', 'Survived', hue='Sex' ,data=dTitanic_DF)
pAGSB = sns.boxplot(data=dTitTrain_DF.dropna(subset = ['Age']), x= 'Sex', y= 'Age', hue='Survivor')
pAGSB.set(title='Age by Gender and Survival - Box Plot')
pAGSB.legend(bbox_to_anchor=(1.05, .7), loc=2, title = 'Survived',borderaxespad=0.)


Out[71]:
<matplotlib.legend.Legend at 0x11d27cf8>

Process CSV - Generation of Estimation Survival Table


In [103]:
# Determining better odds which will be compared to test group  (First comparison - Pclass and age group)

import csv
  
    
# # Manual - Age Group and gender adult with highest above 49%
# print(define_pClassChildProb(dTitTrain_DF, 1))  
# print(define_pClassAdultProb(dTitTrain_DF, 1, 'female'))    

# print(define_pClassChildProb(dTitTrain_DF, 2))
# print(define_pClassAdultProb(dTitTrain_DF, 2, 'female')) 

# print(define_pClassAdultProb(dTitTrain_DF, 3, 'female'))  

# #sibsp and parch


test_file = open('test.csv', 'rb')
test_file_object = csv.reader(test_file)
header = test_file_object.next()

prediction_file = open("genderPclassbasedmodel.csv", "wb")
prediction_file_object = csv.writer(prediction_file)

prediction_file_object.writerow(["PassengerId", "Survived"])
for row in test_file_object:       # For each row in test.csv
    weight = 0.0
    
    if row[1] == 1:                                               
        weight = weight + 9
    elif row[1] == 2:                                    
        weight = weight + 8
    else:
        weight = 5
        
    if row[3] == 'female':
        weight = weight + 8
    else:
        weight = weight + 2
    
    if row[4] < 21:
        # child
        weight = weight + 6
    else:
        # adult
        weight = weight + 5
        
    aFam = row[5] + row[6]
    
    if aFam > 0:
        weight = weight + 5
        
    else:
        weight = weight + 3
        
    weightScore = weight/40.0 
    
    print(str(weightScore))
    
    if(weight >= .5):
        prediction_file_object.writerow([row[0],'1'])
    else:
        prediction_file_object.writerow([row[0],'0'])
    
        
#prediction_file_object.writerow([row[0],'1'])
#prediction_file_object.writerow([row[0],'0'])

test_file.close()
prediction_file.close()


0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.425
0.575
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.575
0.575
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.425
0.575
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.575
0.575
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.575
0.425
0.425
0.575
0.575
0.425
0.575
0.575
0.425
0.575
0.425
0.425
0.575
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.575
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.575
0.575
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.575
0.575
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.575
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.575
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.575
0.425
0.575
0.425
0.575
0.575
0.425
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.575
0.425
0.575
0.575
0.425
0.575
0.575
0.425
0.425
0.575
0.425
0.425
0.575
0.575
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.425
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.575
0.425
0.575
0.425
0.425
0.425
0.425
0.425
0.575
0.575
0.575
0.575
0.575
0.425
0.575
0.425
0.425
0.425

In [ ]: