Titanic


In [18]:
%matplotlib inline
from matplotlib import pyplot as plt
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier


def load_csv(file_name='train.csv'):
    csv_file_object = csv.reader(open(file_name, 'rb'))  # load the csv file
    header = csv_file_object.next()  # skip the first line, as it is a header
    the_data = []  # rows accumulate here
    for row in csv_file_object:  # step through each remaining row
        the_data.append(row[:])  # append each row to the data list
    return np.array(the_data)
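
# Quick sanity check of the loader (illustrative, not part of the pipeline):
# the Kaggle training set has 891 data rows and 12 columns, all read as strings.
assert load_csv('src/train.csv').shape == (891, 12)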

def plot_xy(survived, died, x_index, y_index, y_label, x_label):
    # Work on copies so the caller's arrays are not mutated; replace missing
    # values (empty strings) with -1 so the cast to float works.
    survived = survived.copy()
    died = died.copy()
    survived[survived == ''] = -1.
    died[died == ''] = -1.
    x = np.asarray(survived[:, x_index], dtype=np.float32)
    y = np.asarray(survived[:, y_index], dtype=np.float32)
    plt.plot(np.asarray(died[:, x_index], dtype=np.float32), np.asarray(died[:, y_index], dtype=np.float32), 'bo')
    plt.plot(x, y, 'rx')
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.autoscale(tight=False)
    plt.show()

def plot_survivor_death_bar(index, survivors, deaths):
    # Stacked bar at position `index`: survivors at the bottom, deaths on top in red.
    plt.bar(index, survivors.shape[0], label="Survivors")
    plt.bar(index, deaths.shape[0], bottom=survivors.shape[0], color='r', label="Deaths")
    

train_data = np.asarray(load_csv('src/train.csv'))

survived = train_data[train_data[:,1]=='1']
died = train_data[train_data[:, 1]=='0']

#plotting scatter charts
plot_xy(survived, died, 5, 6, 'Siblings/Spouses', 'Age')
plot_xy(survived, died, 5, 7, 'Parents/Children', 'Age')

#plotting bar charts
plot_survivor_death_bar(0, survived, died)
plot_survivor_death_bar(1, survived[survived[:, 4]=='female'], died[died[:, 4]=='female'])
plot_survivor_death_bar(2, survived[survived[:, 4]=='male'], died[died[:, 4]=='male'])
plot_survivor_death_bar(3, survived[survived[:, 4]==''], died[died[:, 4]==''])  # missing values are empty strings, not None
plt.xticks([0.4, 1.4, 2.4, 3.4], ('All', 'Female', 'Male', 'Unknown'))
plt.ylabel('# people')
plt.xlabel('Sex')
plt.show()

plot_survivor_death_bar(0, survived[survived[:, 2]=='1'], died[died[:, 2]=='1'])
plot_survivor_death_bar(1, survived[survived[:, 2]=='2'], died[died[:, 2]=='2'])
plot_survivor_death_bar(2, survived[survived[:, 2]=='3'], died[died[:, 2]=='3'])
plot_survivor_death_bar(3, survived[survived[:, 2]==''], died[died[:, 2]==''])  # missing values are empty strings, not None
plt.xticks([0.4, 1.4, 2.4, 3.4], ('First', 'Second', 'Third', 'Unknown'))
plt.ylabel('# people')
plt.xlabel('Class')
plt.show()
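

The same comparison can be read off numerically. A minimal sketch (reusing the column indices above; illustrative, not part of the pipeline) of overall and per-sex survival rates:


In [ ]:
total = float(survived.shape[0] + died.shape[0])
print "Overall survival rate: {0:.3f}".format(survived.shape[0] / total)
for sex in ['female', 'male']:  # column 4 is sex
    s = (survived[:, 4] == sex).sum()
    d = (died[:, 4] == sex).sum()
    print "{0} survival rate: {1:.3f}".format(sex, s / float(s + d))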


Port of Embarkation


In [19]:
plot_survivor_death_bar(0, survived[survived[:, 11]=='C'], died[died[:, 11]=='C'])
plot_survivor_death_bar(1, survived[survived[:, 11]=='Q'], died[died[:, 11]=='Q'])
plot_survivor_death_bar(2, survived[survived[:, 11]=='S'], died[died[:, 11]=='S'])
plot_survivor_death_bar(3, survived[survived[:, 11]==''], died[died[:, 11]==''])  # missing values are empty strings, not None
plt.xticks([0.4, 1.4, 2.4, 3.4], ('Cherbourg', 'Queenstown', 'Southampton', 'Unknown'))
plt.ylabel('# people')
plt.xlabel('Port of Embarkation')
plt.show()


Superclass for the hypotheses. It defines common behavior and a common interface: train() grid-searches hyperparameter combinations, and each subclass decides which combinations are valid and how to build an estimator from them (see the minimal sketch after the class below).


In [20]:
from sklearn import preprocessing
class Hypothesis:

  def __init__(self, train_data, validation_data):
    self.trainY = self.extract_y(train_data)
    self.trainX = self.extract_x(train_data)
    self.validationY = self.extract_y(validation_data)
    self.validationX = self.extract_x(validation_data)
    # Note: the training and validation sets are standardized independently here.
    self.trainX = preprocessing.scale(self.trainX)
    self.validationX = preprocessing.scale(self.validationX)

  def extract_y(self, data):
    y = np.asarray(data[:, 1], dtype=np.float32)
    return np.reshape(y, -1)
  
  #0   PassengerId
  #1   Survived
  #2   pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)             
  #3   name            Name
  #4   sex             Sex
  #5   age             Age
  #6   sibsp           Number of Siblings/Spouses Aboard
  #7   parch           Number of Parents/Children Aboard
  #8   ticket          Ticket Number
  #9   fare            Passenger Fare
  #10  cabin           Cabin
  #11  embarked        Port of Embarkation
  def extract_x(self, data, columns_2_remove=[0, 1, 3, 8, 10]):
    relevant_features = np.delete(data, columns_2_remove, 1)
    relevant_features = self.convert_texts(relevant_features)
    relevant_features=np.asarray(relevant_features, dtype=np.float32)
    return relevant_features

  def convert_texts(self, data):
    # Ordinal-encode the categorical strings: sex (male=0, female=1),
    # port of embarkation (C=0, Q=1, S=2); missing values become -1.
    data[data == 'male'] = 0.
    data[data == 'female'] = 1.
    data[data == ''] = -1.
    data[data == 'C'] = 0.
    data[data == 'Q'] = 1.
    data[data == 'S'] = 2.
    return data

  def train(self):
    highest_precision = 0
    for local_penalty in ['l1', 'l2']:
      for local_tol in np.arange(0.01, 2.5, 0.05):
        for local_c in np.arange(0.5, 5.5, 0.5):
          for local_fit_intercept in [True, False]:
            for local_intercept_scaling in np.arange(0.1, 3, 0.5):
              for local_loss in ['l1', 'l2']:
                if self.valid_parameters(local_c, local_loss, local_penalty, local_tol, local_fit_intercept, local_intercept_scaling):
                  hipothesys = self.create_hipothesys(local_c, local_loss, local_penalty, local_tol, local_fit_intercept, local_intercept_scaling)
                  precision = self.calculatePrecision(hipothesys)
                  if precision > highest_precision:
                    print "Nova melhor precisao atual: {0}".format(precision)    
                    self.hipothesys = hipothesys
                    self.precision = precision
                    highest_precision = precision


  def score(self):
    return self.precision

  def calculatePrecision(self, hipothesys):
    # Note: scikit-learn's score() returns mean accuracy on the given data,
    # not precision in the precision/recall sense.
    hipothesys.fit(self.trainX, self.trainY)
    return hipothesys.score(self.validationX, self.validationY)

  def predict(self, test_data):
    # test.csv has no Survived column, so the drop indices shift relative to train.csv:
    # [0, 2, 7, 9] = PassengerId, Name, Ticket, Cabin.
    x = self.extract_x(test_data, [0, 2, 7, 9])
    x = preprocessing.scale(x)  # scale like trainX/validationX, which are scaled in __init__
    predicted = self.hipothesys.predict(x)
    result = np.empty([test_data.shape[0], 2], dtype=int)
    result[:, 0] = test_data[:, 0]
    result[:, 1] = predicted.astype('int')
    return result

  def __str__(self):
    return "{0} with params {1}".format(self.hipothesys, self.hipothesys.get_params())

In [21]:
class LinearSVCHipotesys(Hypothesis):

	def valid_parameters(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
		# loss='l1' is only supported when dual=True; we fit with dual=False,
		# so skip it for either penalty.
		return loss != 'l1'

	def create_hipothesys(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
		hipotesys = LinearSVC(C=c, loss=loss, penalty=penalty, tol=tol, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, dual=False, verbose=0)
		return hipotesys

In [22]:
class LogisticRegressionHipotesys(Hypothesis):

	def valid_parameters(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
		# LogisticRegression takes no loss parameter, so the grid's two loss values
		# produce identical models; skip one combination to avoid a redundant fit.
		if penalty == 'l1' and loss == 'l1':
			return False
		else:
			return True

	def create_hipothesys(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
		hypothesis = LogisticRegression(C=c, penalty=penalty, tol=tol, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, dual=False)
		return hypothesis

In [23]:
class SVCHipotesys(Hypothesis):

	def train(self):
		highest_precision = 0
		for c in np.arange(0.1, 2.5, 0.1):
			# gamma=0.0 means 1/n_features in this version of scikit-learn
			hipothesys = SVC(C=c, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True,
                             probability=False, tol=0.0001, cache_size=200, class_weight=None, verbose=False, 
                             max_iter=-1, random_state=None)
			precision = self.calculatePrecision(hipothesys)
			if precision > highest_precision:
				self.hipothesys = hipothesys 
				self.precision = precision
				highest_precision = precision

In [ ]:
class RandomForestHipothesys(Hypothesis):
    def train(self):
        highest_precision = 0
        for n_estimator in np.arange(1, 20, 1):
            for criterion in ['gini', 'entropy']:
                hipothesys = RandomForestClassifier(n_estimators=n_estimator, criterion=criterion)
                precision = self.calculatePrecision(hipothesys)
                if precision > highest_precision:
                    self.hipothesys = hipothesys
                    self.precision = precision
                    highest_precision = precision  # track the running best

In [ ]:
np.random.shuffle(train_data)
import math
validation_size = int(math.floor(train_data.shape[0] * 0.2))  # hold out 20%; int() needed for slicing
validation_data = np.asarray(train_data[:validation_size, :])
train_data = np.asarray(train_data[validation_size:, :])

hipotesys_list = []
hipotesys_list.append(RandomForestHipothesys(train_data, validation_data))
hipotesys_list.append(LogisticRegressionHipotesys(train_data, validation_data))
hipotesys_list.append(LinearSVCHipotesys(train_data, validation_data))
hipotesys_list.append(SVCHipotesys(train_data, validation_data))
winner_hypothesis = None
highest_score = 0.0
for h in hipotesys_list:
	h.train()
	score = h.score()
	if score > highest_score:
		winner_hypothesis = h
		highest_score = score

print "======== Winner Hipothesys ============="
print winner_hypothesis
print "========================================"
print "Precision against validation data:"
print highest_score


New best precision so far: 0.769662921348
New best precision so far: 0.775280898876
New best precision so far: 0.780898876404
New best precision so far: 0.792134831461
New best precision so far: 0.797752808989
New best precision so far: 0.803370786517
New best precision so far: 0.808988764045
New best precision so far: 0.814606741573

In [ ]:
test_data = load_csv('src/test.csv')
final_result = winner_hypothesis.predict(test_data)
np.savetxt('final_result.csv', final_result.astype('int'), fmt="%.1d", delimiter=",")
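

Note that Kaggle expects a PassengerId,Survived header row, which np.savetxt as called above does not write. A variant that includes it (assuming NumPy >= 1.7, where savetxt gained the header and comments arguments):


In [ ]:
np.savetxt('final_result.csv', final_result.astype('int'), fmt="%d", delimiter=",",
           header="PassengerId,Survived", comments="")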