In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [2]:
# Windows paths
train = pd.read_csv(r'C:\Users\hrao\Documents\Personal\HK\Python\train.csv')
test = pd.read_csv(r'C:\Users\hrao\Documents\Personal\HK\Python\test.csv')

# macOS paths
#train = pd.read_csv(r'/Users/Harish/Documents/HK_Work/Python/Machine-Learning/train.csv')
#test = pd.read_csv(r'/Users/Harish/Documents/HK_Work/Python/Machine-Learning/test.csv')
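
Before filling anything, it helps to see what is actually missing. isnull().sum() is the standard pandas check (a quick aside; output omitted):

train.isnull().sum()   # train: Age, Cabin, and Embarked contain NaNs
test.isnull().sum()    # test: Age, Cabin, and one Fare value contain NaNs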

In [3]:
train.head(n=3)


Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S

In [4]:
# Blanket-fill every missing value (Age, Cabin, Embarked, test-set Fare) with 0.
train = train.fillna(0)
test = test.fillna(0)
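
Filling every NaN with a literal 0 is quick, but it invents 0-year-old passengers in Age and a free fare in the test set. A gentler alternative would be median/mode imputation; a sketch of that approach, not what this notebook runs:

train['Age'] = train['Age'].fillna(train['Age'].median())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Age'] = test['Age'].fillna(test['Age'].median())
test['Fare'] = test['Fare'].fillna(test['Fare'].median())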

In [5]:
# Keep only the columns used as features below.
train = train.drop(['Name', 'SibSp', 'Parch', 'Cabin', 'Ticket'], axis=1)

In [6]:
train.head(n=3)


Out[6]:
PassengerId Survived Pclass Sex Age Fare Embarked
0 1 0 3 male 22.0 7.2500 S
1 2 1 1 female 38.0 71.2833 C
2 3 1 3 female 26.0 7.9250 S

In [7]:
# Encode Sex numerically: female -> 1, male -> 0.
train['Sex'] = train['Sex'].map({'female': 1, 'male': 0})
test['Sex'] = test['Sex'].map({'female': 1, 'male': 0})

In [8]:
# Encode Embarked numerically: C -> 1, Q -> 2, S -> 3.
# The two train rows whose Embarked was blanket-filled to 0 in In [4] have no
# key in this dict, so map() turns them back into NaN; In [12] re-fills them
# with 0, which is also why Out[9] shows Embarked as a float column.
train['Embarked'] = train['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})
test['Embarked'] = test['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})
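
Integer codes impose an artificial ordering on the ports (C < Q < S). If that bothers the model, one-hot encoding with pd.get_dummies is the usual alternative; a sketch, not what this notebook runs:

train = pd.get_dummies(train, columns=['Embarked'])
test = pd.get_dummies(test, columns=['Embarked'])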

In [9]:
train.head(n=3)


Out[9]:
PassengerId Survived Pclass Sex Age Fare Embarked
0 1 0 3 0 22.0 7.2500 3.0
1 2 1 1 1 38.0 71.2833 1.0
2 3 1 3 1 26.0 7.9250 3.0

In [10]:
# Apply the same column drop to the test set.
test = test.drop(['Name', 'SibSp', 'Parch', 'Cabin', 'Ticket'], axis=1)

In [11]:
test.head(n=3)


Out[11]:
PassengerId Pclass Sex Age Fare Embarked
0 892 3 0 34.5 7.8292 2
1 893 3 1 47.0 7.0000 3
2 894 2 0 62.0 9.6875 2

In [12]:
# Age was already filled in In [4], so the two Age lines are redundant; the
# Embarked lines catch the NaNs reintroduced by map() in In [8].
train['Age'] = train['Age'].fillna(0)
test['Age'] = test['Age'].fillna(0)
train['Embarked'] = train['Embarked'].fillna(0)
test['Embarked'] = test['Embarked'].fillna(0)

In [13]:
#train.to_csv(r'C:\Users\hrao\Documents\Personal\HK\Machine-Learning\train_clean_data.csv')
#test.to_csv(r'C:\Users\hrao\Documents\Personal\HK\Machine-Learning\test_clean_data.csv')

In [36]:
# Features and labels: drop the target and the identifier from the inputs.
X_train = train.drop(['Survived', 'PassengerId'], axis=1)
Y_train = train['Survived']
X_test = test.drop('PassengerId', axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape


Out[36]:
((891, 5), (891,), (418, 5))

In [37]:
X_train.head(n=5)


Out[37]:
Pclass Sex Age Fare Embarked
0 3 0 22.0 7.2500 3.0
1 1 1 38.0 71.2833 1.0
2 3 1 26.0 7.9250 3.0
3 1 1 35.0 53.1000 3.0
4 3 0 35.0 8.0500 3.0

In [38]:
Y_train.head(n=5)


Out[38]:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [39]:
X_test.head(n=5)


Out[39]:
Pclass Sex Age Fare Embarked
0 3 0 34.5 7.8292 2
1 3 1 47.0 7.0000 3
2 2 0 62.0 9.6875 2
3 3 0 27.0 8.6625 3
4 3 1 22.0 12.2875 3

In [40]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with scikit-learn defaults.
lr = LogisticRegression()
lr.fit(X_train, Y_train)


Out[40]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
prediction_lr = lr.predict(X_test)
accuracy_lr = lr.score(X_train, Y_train)  # accuracy on the training rows
accuracy_lr


Out[41]:
0.78675645342312006
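
lr.score(X_train, Y_train) measures accuracy on the same rows the model was fit on, so 0.787 is an optimistic estimate. A quick held-out check with cross_val_score (a sketch; 5-fold is an arbitrary choice):

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(), X_train, Y_train, cv=5)
cv_scores.mean()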

In [42]:
# Submission frame: PassengerId paired with the predicted Survived label.
prediction_passenger_lr = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': prediction_lr})
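
To turn this frame into a Kaggle submission, write it out without the index (the filename here is illustrative):

prediction_passenger_lr.to_csv('submission_lr.csv', index=False)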

In [43]:
from sklearn import svm
# Note: despite the lsvc name, svm.SVC() defaults to an RBF kernel; this is
# not LinearSVC.
lsvc = svm.SVC()
lsvc.fit(X_train, Y_train)


Out[43]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
prediction_svc = lsvc.predict(X_test)

In [45]:
accuracy_svc = lsvc.score(X_train, Y_train)  # training accuracy again
accuracy_svc


Out[45]:
0.90684624017957349
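
An RBF-kernel SVC is sensitive to feature scale, and Fare dwarfs the other columns here, so some of this 0.907 training accuracy may just be the kernel memorizing unscaled data. Standardizing first is the usual remedy; a sketch using a scikit-learn Pipeline, not what this notebook runs:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svc = make_pipeline(StandardScaler(), svm.SVC())
scaled_svc.fit(X_train, Y_train)
scaled_svc.score(X_train, Y_train)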

In [47]:
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [48]:
knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(X_train, Y_train) 
Y_pred = knn.predict(X_test) 
acc_knn = round(knn.score(X_train, Y_train) * 100, 2) 
acc_knn


Out[48]:
83.840000000000003

In [49]:
gaussian = GaussianNB() 
gaussian.fit(X_train, Y_train) 
Y_pred = gaussian.predict(X_test) 
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2) 
acc_gaussian


Out[49]:
77.780000000000001

In [50]:
perceptron = Perceptron() 
perceptron.fit(X_train, Y_train) 
Y_pred = perceptron.predict(X_test) 
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2) 
acc_perceptron


Out[50]:
65.319999999999993

In [51]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest


Out[51]:
97.980000000000004
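
Every score above is training-set accuracy, so the ranking (random forest first at ~98%) mostly reflects each model's capacity to memorize, not how it will generalize. A fairer comparison would cross-validate each candidate; a sketch along those lines:

from sklearn.model_selection import cross_val_score

for name, model in [('logistic regression', LogisticRegression()),
                    ('svc', SVC()),
                    ('knn', KNeighborsClassifier(n_neighbors=3)),
                    ('naive bayes', GaussianNB()),
                    ('random forest', RandomForestClassifier(n_estimators=100))]:
    scores = cross_val_score(model, X_train, Y_train, cv=5)
    print(name, round(scores.mean() * 100, 2))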