In [18]:
%matplotlib inline
from pylab import *
from matplotlib import pyplot as plt
import csv as csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
def load_csv(file_name='train.csv'):
    """Read a CSV file and return its data rows as a 2-D array of strings.

    The first line is treated as a header and discarded. Every remaining
    row is kept as-is (all values stay strings; missing values are '').

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        One row per CSV record. An empty file (or a file holding only the
        header) yields an empty array instead of raising.
    """
    import sys

    # The csv module expects a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csv_file = open(file_name, 'rb')
    else:
        csv_file = open(file_name, 'r', newline='')

    # The context manager guarantees the handle is closed even if parsing fails.
    with csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header line, tolerate an empty file
        rows = [row[:] for row in reader]
    return np.array(rows)
def plot_xy(survived, died, x_index, y_index, y_label, x_label):
    """Scatter-plot two columns, deaths as blue dots and survivors as red crosses.

    Parameters
    ----------
    survived, died : numpy.ndarray
        2-D arrays of strings (one row per passenger).
    x_index, y_index : int
        Column indexes plotted on the x and y axis.
    y_label, x_label : str
        Axis labels. Note the order: the y label comes first.

    Empty strings (missing values) are plotted as -1. The input arrays are
    not modified; the substitution happens on a per-column copy.
    """
    def column_as_float(rows, column):
        # np.where builds a new array, so the caller's data keeps its '' markers.
        raw = np.asarray(rows)[:, column]
        return np.where(raw == '', '-1', raw).astype(np.float32)

    # Deaths are drawn first so the survivor markers end up on top.
    plt.plot(column_as_float(died, x_index), column_as_float(died, y_index), 'bo')
    plt.plot(column_as_float(survived, x_index), column_as_float(survived, y_index), 'rx')
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.autoscale(tight=False)
    plt.show()
def plot_survivor_death_bar(index, survivors, deaths):
    """Draw one stacked bar: survivor count at the bottom, death count (red) on top.

    Parameters
    ----------
    index : number
        x position of the bar's left edge.
    survivors, deaths : numpy.ndarray
        Row sets; only their number of rows (``shape[0]``) is used.
    """
    survivor_count = survivors.shape[0]
    death_count = deaths.shape[0]
    # The callers place tick labels at index + 0.4, i.e. the middle of a
    # default-width bar that starts at `index`. Pin the alignment explicitly
    # so this does not depend on the matplotlib version's default.
    plt.bar(index, survivor_count, align='edge', label="Survivors")
    plt.bar(index, death_count, bottom=survivor_count, align='edge', color='r')
# Load the training set and split it by outcome (column 1 holds '1' / '0').
train_data = load_csv('src/train.csv')
survived = train_data[train_data[:, 1] == '1']
died = train_data[train_data[:, 1] == '0']

# Scatter charts: age (column 5) against family columns 6 and 7.
plot_xy(survived, died, 5, 6, 'Siblings/Spouses', 'Idade')
plot_xy(survived, died, 5, 7, 'Parents/Children', 'Idade')

# Bar chart: survivors vs. deaths by sex (column 4).
known_sexes = ['female', 'male']
plot_survivor_death_bar(0, survived, died)
for bar_position, sex in enumerate(known_sexes, 1):
    plot_survivor_death_bar(bar_position,
                            survived[survived[:, 4] == sex],
                            died[died[:, 4] == sex])
# Values read from a CSV are strings and never equal None, so the "unknown"
# bar is built from everything that is not one of the known categories,
# whatever placeholder a missing value carries.
plot_survivor_death_bar(len(known_sexes) + 1,
                        survived[~np.isin(survived[:, 4], known_sexes)],
                        died[~np.isin(died[:, 4], known_sexes)])
plt.xticks([0.4, 1.4, 2.4, 3.4], ('All', 'Female', 'Male', 'Unknow'))
plt.ylabel('# people')
plt.xlabel('Sex')
plt.show()

# Bar chart: survivors vs. deaths by passenger class (column 2).
known_classes = ['1', '2', '3']
for bar_position, passenger_class in enumerate(known_classes):
    plot_survivor_death_bar(bar_position,
                            survived[survived[:, 2] == passenger_class],
                            died[died[:, 2] == passenger_class])
plot_survivor_death_bar(len(known_classes),
                        survived[~np.isin(survived[:, 2], known_classes)],
                        died[~np.isin(died[:, 2], known_classes)])
plt.xticks([0.4, 1.4, 2.4, 3.4], ('First', 'Second', 'Third', 'Unknow'))
plt.ylabel('# people')
plt.xlabel('Class')
plt.show()
Port of Embarkation
In [19]:
# Bar chart: survivors vs. deaths by port of embarkation (column 11).
known_ports = ['C', 'Q', 'S']
for bar_position, port in enumerate(known_ports):
    plot_survivor_death_bar(bar_position,
                            survived[survived[:, 11] == port],
                            died[died[:, 11] == port])
# Values read from a CSV are strings and never equal None, so the "unknown"
# bar is built from everything that is not one of the known port codes,
# whatever placeholder a missing value carries.
plot_survivor_death_bar(len(known_ports),
                        survived[~np.isin(survived[:, 11], known_ports)],
                        died[~np.isin(died[:, 11], known_ports)])
plt.xticks([0.4, 1.4, 2.4, 3.4], ('Cherbourg', 'Queenstown', 'Southampton', 'Unknow'))
plt.ylabel('# people')
plt.xlabel('Port of Embarkation')
plt.show()
Superclass for all hypotheses. It defines the common behavior and a common interface.
In [20]:
from sklearn import preprocessing
class Hypothesis(object):
    """Base class for a model family that is tuned against a validation set.

    The constructor turns raw string rows into numeric feature matrices and
    label vectors. ``train`` runs a parameter grid search and keeps the
    estimator with the best validation score in ``self.hipothesys`` (its
    score in ``self.precision``). Subclasses either implement the two hooks
    ``valid_parameters`` / ``create_hipothesys`` used by the default
    ``train``, or override ``train`` entirely.
    """

    def __init__(self, train_data, validation_data):
        """Extract features/labels and standardise the features.

        Parameters
        ----------
        train_data, validation_data : numpy.ndarray
            2-D arrays of strings laid out as described above ``extract_x``.
        """
        self.trainY = self.extract_y(train_data)
        self.trainX = self.extract_x(train_data)
        self.validationY = self.extract_y(validation_data)
        self.validationX = self.extract_x(validation_data)
        # Fit the standardisation on the training features only and reuse the
        # very same transform for validation and (in predict) for test data.
        # Scaling each set with its own mean/std, or not scaling test data at
        # all, feeds the model inputs on a different scale than it was
        # trained on.
        self.scaler = preprocessing.StandardScaler().fit(self.trainX)
        self.trainX = self.scaler.transform(self.trainX)
        self.validationX = self.scaler.transform(self.validationX)

    def extract_y(self, data):
        """Return column 1 (the label) of ``data`` as a flat float32 vector."""
        y = np.asarray(data[:, 1], dtype=np.float32)
        return np.reshape(y, -1)

    #0 PassengerId
    #1 Survived
    #2 pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
    #3 name Name
    #4 sex Sex
    #5 age Age
    #6 sibsp Number of Siblings/Spouses Aboard
    #7 parch Number of Parents/Children Aboard
    #8 ticket Ticket Number
    #9 fare Passenger Fare
    #10 cabin Cabin
    #11 embarked Port of Embarkation
    def extract_x(self, data, columns_2_remove=[0, 1, 3, 8, 10]):
        """Build the float32 feature matrix from raw string rows.

        Parameters
        ----------
        data : numpy.ndarray
            2-D array of strings; it is not modified (np.delete copies).
        columns_2_remove : list of int
            Column indexes dropped before conversion. The default drops
            id, label, name, ticket and cabin from the training layout.
        """
        relevant_features = np.delete(data, columns_2_remove, 1)
        relevant_features = self.convert_texts(relevant_features)
        return np.asarray(relevant_features, dtype=np.float32)

    def convert_texts(self, data):
        """Replace the known text values by numeric codes, in place.

        'male' -> 0, 'female' -> 1, '' (missing) -> -1,
        'C' -> 0, 'Q' -> 1, 'S' -> 2. Returns the same array it was given.
        """
        replacements = (
            ('male', 0.),
            ('female', 1.),
            ('', -1.),
            ('C', 0.),
            ('Q', 1.),
            ('S', 2.),
        )
        for text, code in replacements:
            data[data == text] = code
        return data

    def valid_parameters(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Hook: tell whether a parameter combination should be tried by ``train``."""
        raise NotImplementedError("subclasses using the default train() must implement valid_parameters")

    def create_hipothesys(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Hook: build an unfitted estimator for a parameter combination."""
        raise NotImplementedError("subclasses using the default train() must implement create_hipothesys")

    def train(self):
        """Grid-search the hooks' parameter space and keep the best estimator.

        Every accepted combination is fitted on the training data and scored
        on the validation data; a candidate replaces the current best only
        when its score is strictly higher.
        """
        from itertools import product

        # product() iterates with the right-most factor changing fastest,
        # i.e. in the same order as the equivalent nested for-loops.
        grid = product(
            ['l1', 'l2'],                # penalty
            np.arange(0.01, 2.5, 0.05),  # tol
            np.arange(0.5, 5.5, 0.5),    # C
            [True, False],               # fit_intercept
            np.arange(0.1, 3, 0.5),      # intercept_scaling
            ['l1', 'l2'],                # loss
        )
        highest_precision = 0
        for local_penalty, local_tol, local_c, local_fit_intercept, local_intercept_scaling, local_loss in grid:
            if not self.valid_parameters(local_c, local_loss, local_penalty, local_tol,
                                         local_fit_intercept, local_intercept_scaling):
                continue
            hipothesys = self.create_hipothesys(local_c, local_loss, local_penalty, local_tol,
                                                local_fit_intercept, local_intercept_scaling)
            precision = self.calculatePrecision(hipothesys)
            if precision > highest_precision:
                print("Nova melhor precisao atual: {0}".format(precision))
                self.hipothesys = hipothesys
                self.precision = precision
                highest_precision = precision

    def score(self):
        """Return the validation score of the estimator kept by ``train``."""
        return self.precision

    def calculatePrecision(self, hipothesys):
        """Fit ``hipothesys`` on the training set and return its ``score()`` on the validation set."""
        hipothesys.fit(self.trainX, self.trainY)
        return hipothesys.score(self.validationX, self.validationY)

    def predict(self, test_data):
        """Predict labels for raw test rows.

        Parameters
        ----------
        test_data : numpy.ndarray
            2-D array of strings in the test layout (no label column), so
            columns 0, 2, 7 and 9 are dropped to obtain the same features
            as in training.

        Returns
        -------
        numpy.ndarray
            Integer array of shape (n_rows, 2): column 0 of ``test_data``
            and the predicted label.
        """
        x = self.extract_x(test_data, [0, 2, 7, 9])
        # Apply the transform fitted on the training features; the model has
        # only ever seen standardised inputs.
        x = self.scaler.transform(x)
        predicted = self.hipothesys.predict(x)
        resultado = np.empty([test_data.shape[0], 2], dtype=int)
        resultado[:, 0] = test_data[:, 0].astype(int)
        resultado[:, 1] = np.asarray(predicted).astype('int')
        return resultado

    def __str__(self):
        return "{0} with params {1}".format(self.hipothesys, self.hipothesys.get_params())
In [21]:
class LinearSVCHipotesys(Hypothesis):
    """Grid-search hooks that tune a LinearSVC through the base-class ``train``."""

    def valid_parameters(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Reject every combination that pairs loss 'l1' with penalty 'l1' or 'l2'."""
        # penalty='l2' with loss='l1' is only supported when dual=True, and the
        # estimator below is always built with dual=False.
        rejected = loss == 'l1' and penalty in ('l1', 'l2')
        return not rejected

    def create_hipothesys(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Return an unfitted LinearSVC configured with the given parameters."""
        return LinearSVC(
            C=c,
            loss=loss,
            penalty=penalty,
            tol=tol,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            dual=False,
            verbose=0,
        )
In [22]:
class LogisticRegressionHipotesys(Hypothesis):
    """Grid-search hooks that tune a LogisticRegression through the base-class ``train``."""

    def valid_parameters(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Accept each distinct estimator configuration exactly once.

        ``create_hipothesys`` does not pass ``loss`` to LogisticRegression,
        so two combinations that differ only in ``loss`` build identical
        estimators. Only ``loss == 'l2'`` is accepted, which keeps every
        (penalty, tol, C, fit_intercept, intercept_scaling) combination
        while avoiding fitting the same model more than once.
        """
        return loss == 'l2'

    def create_hipothesys(self, c, loss, penalty, tol, fit_intercept, intercept_scaling):
        """Return an unfitted LogisticRegression; ``loss`` is intentionally unused."""
        hypothesis = LogisticRegression(C=c, penalty=penalty, tol=tol, fit_intercept=fit_intercept,
                                        intercept_scaling=intercept_scaling, dual=False)
        return hypothesis
In [23]:
class SVCHipotesys(Hypothesis):
    """RBF-kernel SVC tuned over the regularisation strength C only."""

    def train(self):
        """Try C from 0.1 up to (excluding) 2.5 in steps of 0.1 and keep the best scorer.

        A candidate is stored in ``self.hipothesys`` / ``self.precision`` only
        when its validation score is strictly higher than the best seen so far.
        """
        best_so_far = 0
        for penalty_c in np.arange(0.1, 2.5, 0.1):
            candidate = SVC(
                C=penalty_c,
                kernel='rbf',
                degree=3,
                gamma=0.0,
                coef0=0.0,
                shrinking=True,
                probability=False,
                tol=0.0001,
                cache_size=200,
                class_weight=None,
                verbose=False,
                max_iter=-1,
                random_state=None,
            )
            candidate_score = self.calculatePrecision(candidate)
            if not candidate_score > best_so_far:
                continue
            self.hipothesys = candidate
            self.precision = candidate_score
            best_so_far = candidate_score
In [ ]:
class RandomForestHipothesys(Hypothesis):
    """Random forest tuned over the number of trees and the split criterion."""

    def train(self):
        """Try 1..19 trees with 'gini' and 'entropy' and keep the best scorer.

        A candidate is stored in ``self.hipothesys`` / ``self.precision`` only
        when its validation score is strictly higher than the best seen so far.
        """
        highest_precision = 0
        for n_estimator in np.arange(1, 20, 1):
            for criterion in ['gini', 'entropy']:
                hipothesys = RandomForestClassifier(n_estimators=int(n_estimator), criterion=criterion)
                precision = self.calculatePrecision(hipothesys)
                if precision > highest_precision:
                    # Keep the fit that produced this score. Fitting again
                    # here would train a different random forest, so the
                    # stored score would no longer describe the stored model.
                    self.hipothesys = hipothesys
                    self.precision = precision
                    # Raise the bar; without this the last forest with a
                    # score above 0 would win instead of the best one.
                    highest_precision = precision
In [ ]:
import math

# Fixed seed so the split (and any estimator drawing from numpy's global
# random state) is the same on every run.
RANDOM_SEED = 42
VALIDATION_FRACTION = 0.2

np.random.seed(RANDOM_SEED)
# Shuffle a copy: train_data itself stays untouched, so re-running this cell
# always starts from the full data set instead of an already shrunken one.
shuffled_data = train_data.copy()
np.random.shuffle(shuffled_data)

# math.floor returns a float on Python 2; slice bounds must be integers.
validation_size = int(math.floor(shuffled_data.shape[0] * VALIDATION_FRACTION))
validation_data = shuffled_data[:validation_size, :]
train_split = shuffled_data[validation_size:, :]

hipotesys_list = [
    RandomForestHipothesys(train_split, validation_data),
    LogisticRegressionHipotesys(train_split, validation_data),
    LinearSVCHipotesys(train_split, validation_data),
    SVCHipotesys(train_split, validation_data),
]

# Train every family and keep the one with the strictly highest validation score.
winner_hypothesis = None
highest_score = 0.0
for candidate in hipotesys_list:
    candidate.train()
    candidate_score = candidate.score()
    if candidate_score > highest_score:
        winner_hypothesis = candidate
        highest_score = candidate_score

print("======== Winner Hipothesys =============")
print(winner_hypothesis)
print("========================================")
print("Precision against validation data:")
print(highest_score)
In [ ]:
# Predict the held-out test set with the winning model and write the
# two integer columns returned by predict() as comma-separated values.
test_data = load_csv('src/test.csv')
final_result = winner_hypothesis.predict(test_data)
np.savetxt(
    'final_result.csv',
    final_result.astype('int'),
    delimiter=",",
    fmt="%.1d",
)