notebook.community

Edit and run



In [ ]:

    
# Load data
import os

import numpy as np
import pandas as pd
# Location of TitanicData.csv. The default is the original hard-coded local path;
# set the TITANIC_CSV environment variable to load the file from another location
# without editing the notebook.
TITANIC_CSV = os.environ.get(
    'TITANIC_CSV',
    '/Users/annette/Desktop/IntroToDataScienceClass/Lesson1/Numpy and Pandas/TitanicData.csv',
)
df = pd.read_csv(TITANIC_CSV)



In [ ]:

    
# Get another copy of df
# Extract Salutation
# name_extract takes a passenger name, takes the piece that follows the first comma (,),
# cuts that piece at its first dot (.), and strips the surrounding whitespace.
# For example, 'Jain, Mr. Kunal' gives 'Mr' and 'Jain, Miss. Jenika' gives 'Miss'.
def name_extract(word):
    """Return the salutation contained in a 'Surname, Title. Given names' string.

    The text after the first comma is cut at its first dot and stripped of
    surrounding whitespace, e.g. 'Jain, Mr. Kunal' -> 'Mr'.

    Raises IndexError when `word` contains no comma.
    """
    after_surname = word.split(',')[1]
    title = after_surname.split('.')[0]
    return title.strip()

# Run name_extract over every entry of the Name column and wrap the result
# in a one-column DataFrame (column 'Salutation') called df2.
df2 = pd.DataFrame(dict(Salutation=df['Name'].apply(name_extract)))

# Attach the Salutation column to the original data frame; both sides are
# matched on their row index rather than on a key column.
df2 = pd.merge(left=df, right=df2, left_index=True, right_index=True)



In [ ]:

    
# Look at the distribution: count of non-null PassengerId values per salutation.
temp1 = df2.groupby('Salutation').PassengerId.count()
# print is called as a function so the cell runs on Python 3 as well as Python 2,
# matching the print(...) calls used in the later cells.
print(temp1)



In [ ]:

    
# Clean up data: collapse the less common salutations into five groups.
# Each rule is (replacement, salutations to replace); rules are applied in order.
salutation_rules = [
    ('Master', ['Jonkheer']),
    ('Miss', ['Ms', 'Mlle']),
    ('Mrs', ['Mme']),
    ('Sir', ['Capt', 'Don', 'Major', 'Col', 'Sir']),
    ('Lady', ['Dona', 'Lady', 'the Countess']),
]
for replacement, titles in salutation_rules:
    # Assign through a single .loc call. The chained form
    # df2['Salutation'][mask] = value triggers SettingWithCopyWarning and may
    # write to a temporary copy, leaving df2 unchanged.
    df2.loc[df2['Salutation'].isin(titles), 'Salutation'] = replacement



In [ ]:

    
%matplotlib inline
df2.boxplot(column='Age', by = 'Salutation')



In [ ]:

    
# Create a table of median ages: one row per salutation, one column per passenger class.
# `index`/`columns` are used instead of the older `rows`/`cols` keywords, which
# current pandas releases no longer accept.
table = df2.pivot_table(values='Age', index=['Salutation'], columns=['Pclass'], aggfunc=np.median)
# print is called as a function so the cell runs on Python 3 as well as Python 2.
print(table)

# Look up one passenger row's entry in the pivot table
def f(x):
    """Return the value of `table` for a row's class and salutation.

    `x` is indexed by 'Pclass' (selects the table column) and 'Salutation'
    (selects the row label within that column).
    """
    medians_for_class = table[x['Pclass']]
    return medians_for_class[x['Salutation']]

# Replace missing ages
# (1) select the rows whose Age is missing
# (2) for each such row, f looks up the table entry for that passenger's class and salutation
# (3) assign the filled column back to df2; calling fillna(..., inplace=True) on the
#     df2['Age'] selection can act on a temporary copy and leave df2 untouched
# The guard skips the work when nothing is missing (for example when the cell is
# re-run), because a row-wise apply on an empty selection need not return a Series.
missing_age = df2['Age'].isnull()
if missing_age.any():
    imputed_age = df2[missing_age].apply(f, axis=1)
    df2['Age'] = df2['Age'].fillna(imputed_age)



In [ ]:

    
# Columns of df2 carried over into the feature frame as they are
cols_to_keep = ['Age', 'Fare', 'SibSp', 'Parch']

# Create dummy variables: one indicator column per distinct value of each
# categorical column, named with the given prefix
dummy_Sex = pd.get_dummies(df2.Sex, prefix='Sex')
dummy_PClass = pd.get_dummies(df2.Pclass, prefix='PClass')
dummy_Salutation = pd.get_dummies(df2.Salutation, prefix='Salutation')

# Feature frame: the kept columns followed by all indicator columns, joined on the row index
df3 = df2[cols_to_keep].join([dummy_Sex, dummy_PClass, dummy_Salutation])



In [ ]:

    
# Create training and test sets: 40% of the rows are held out for testing,
# and random_state fixes the shuffle.
X = df3
Y = df2['Survived']
# sklearn.cross_validation was removed in scikit-learn 0.20. Import from its
# replacement, sklearn.model_selection, and fall back to the old module only on
# installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=49)



In [ ]:

    
# Use Decision Tree Classifier
from sklearn import tree
# Fix the seed so that re-running the notebook reproduces the same tree and score,
# as the train/test split and the random forest already do.
clf = tree.DecisionTreeClassifier(random_state=49)
clf = clf.fit(X_train, Y_train)

# Score the fitted tree on the held-out split
accuracy = clf.score(X_test, Y_test)
print("Accuracy: %0.2f" % (accuracy))



In [ ]:

    
# Use Random Forest
from sklearn.ensemble import RandomForestClassifier

# Create and train the random forest: 200 trees, all available cores (n_jobs=-1),
# fixed seed so the result is repeatable.
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=121873)
rf.fit(X_train, Y_train)

# Predictions for the held-out rows (kept under the original name Y_pred)
Y_pred = rf.predict(X_test)
# print is called as a function so the cell runs on Python 3 as well as Python 2.
print(rf.score(X_test, Y_test))



In [ ]:

    
# sklearn.cross_validation was removed in scikit-learn 0.20. Import from its
# replacement, sklearn.model_selection, and fall back to the old module only on
# installations that predate it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold cross-validated accuracy on the full feature frame for each model.
# The reported spread is two standard deviations of the fold scores.
scores = cross_val_score(clf, X, Y, cv=5, scoring='accuracy')
print("Decision Tree Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores = cross_val_score(rf, X, Y, cv=5, scoring='accuracy')
print("Random Forest Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))