In [1]:
# Third-party imports. NOTE(review): this notebook targets Python 2 and an
# old scikit-learn — several of these APIs were later removed:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import datasets, svm
from sklearn.preprocessing import Imputer  # unused below; removed in sklearn 0.22 (SimpleImputer is the successor)
import math
from sklearn.cross_validation import ShuffleSplit  # module removed in sklearn 0.20 (now sklearn.model_selection)
from sklearn.metrics import accuracy_score
In [2]:
# IPython shell alias: list the working directory (expects train.csv and
# test.csv to be present for the cells below).
ls
In [3]:
# Candidate feature columns from the raw Titanic data (immediately
# superseded by the narrower list in the next cell).
columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Ticket', 'Fare', 'Cabin', 'Embarked',
]
In [4]:
# Final feature set used to train every model below.
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
In [5]:
#columns = ['Fare','Age']
In [7]:
train_df = pd.read_csv("train.csv")

# Encode Sex as a numeric feature: male -> 0, everything else -> 1.
train_df['Sex'] = train_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

# Impute missing values and scale numeric columns to roughly [-1, 1].
# Only float columns can hold NaN: pandas upcasts int columns that contain
# missing values to float, so int columns are guaranteed NaN-free here.
for col in train_df:
    dt = train_df[col].dtype
    # Fill NaNs with the column mean.
    if dt == float:
        train_df[col].fillna(train_df[col].mean(), inplace=True)
    # Center and scale numeric columns.
    if dt == float or dt == int:
        # Use 2.0 so int columns are not truncated by integer division.
        train_df[col] -= train_df[col].max() / 2.0
        # BUG FIX: the divisor must be the largest *absolute* value in the
        # column. The original took max(max(), fabs(max())), which is a
        # no-op for positive maxima and ignores large-magnitude minima.
        scale = max(math.fabs(train_df[col].max()), math.fabs(train_df[col].min()))
        if scale != 0:  # guard constant columns against division by zero
            train_df[col] /= scale
In [8]:
# Peek at the selected feature columns.
# BUG FIX: .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is
# the label-based replacement.
train_df.loc[:, columns].head()
Out[8]:
In [22]:
test_df = pd.read_csv("test.csv")

# Encode Sex numerically: male -> 0, everything else -> 1.
test_df['Sex'] = test_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

# Impute and normalize exactly as for the training set, except PassengerId
# is left untouched — it is an identifier needed verbatim for the
# submission file, not a feature.
for col in test_df:
    dt = test_df[col].dtype
    if col == 'PassengerId':
        print(col)  # debug: confirm the id column is seen (and skipped below)
    # Fill NaNs with the column mean (only float columns can hold NaN;
    # int columns with NaNs would have been upcast to float by pandas).
    if dt == float:
        test_df[col].fillna(test_df[col].mean(), inplace=True)
    # Center and scale numeric feature columns to roughly [-1, 1].
    if (dt == float or dt == int) and col != 'PassengerId':
        # Use 2.0 so int columns are not truncated by integer division.
        test_df[col] -= test_df[col].max() / 2.0
        # BUG FIX: divide by the largest *absolute* value; the original
        # compared max() with fabs(max()), ignoring the minimum entirely.
        scale = max(math.fabs(test_df[col].max()), math.fabs(test_df[col].min()))
        if scale != 0:  # guard constant columns against division by zero
            test_df[col] /= scale
In [24]:
# Sanity check on the normalized test set.
test_df.head()
Out[24]:
In [25]:
# Kernels we will compare visually.
types_of_kernels = ['linear', 'rbf', 'poly']

# Diverging colormap for the two classes.
color_map = plt.cm.RdBu_r

# Only two features can be drawn on a 2-D decision surface, so the model
# used for plotting must be trained on exactly those two columns.
# BUG FIX: the original fit on all of `columns` (5 features) and then
# called decision_function() on a 2-feature mesh, which raises a
# feature-count mismatch.
plot_cols = columns[:2]

for fig_num, kernel in enumerate(types_of_kernels):
    # Fit the model on the two visualized features.
    clf = svm.SVC(kernel=kernel, gamma=3)
    clf.fit(train_df.loc[:, plot_cols], train_df['Survived'])

    plt.figure(fig_num)
    plt.scatter(train_df[plot_cols[0]], train_df[plot_cols[1]],
                c=train_df['Survived'], zorder=10, cmap=color_map)

    # Circle out the test data.
    plt.scatter(test_df[plot_cols[0]], test_df[plot_cols[1]],
                s=80, facecolors='none', zorder=10)

    plt.axis('tight')
    x_min = train_df[plot_cols[0]].min()
    x_max = train_df[plot_cols[0]].max()
    y_min = train_df[plot_cols[1]].min()
    y_max = train_df[plot_cols[1]].max()

    # Evaluate the decision function on a 200x200 mesh over the data range.
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot.
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=color_map)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
                linestyles=['--', '-', '--'], levels=[-.5, 0, .5])
    plt.title(kernel)

plt.show()
In [12]:
# Track the best (lowest validation error) model found during the search.
minimum = {"error": 1, "model": ""}
error_rate = 0
types_of_kernels = ['linear', 'rbf', 'poly']
model = ""

# Cross-validation split to estimate accuracy (old sklearn API: first arg
# is the number of samples, iteration yields positional index arrays).
cv = ShuffleSplit(len(train_df.index), train_size=300, test_size=None)

for traincv, testcv in cv:
    for fig_num, kernel in enumerate(types_of_kernels):
        print(kernel)
        for i in range(1, 10):
            # rbf ignores `degree`, so sweep gamma for it; linear/poly
            # sweep degree with gamma fixed (degree is also ignored by the
            # linear kernel — kept to match the original search space).
            # This collapses the three copy-pasted branches of the original.
            if kernel == 'rbf':
                clf = svm.SVC(kernel=kernel, degree=3, gamma=i)
            else:
                clf = svm.SVC(kernel=kernel, degree=i, gamma=3)
            # BUG FIX: train only on the CV training rows. The original fit
            # on the full frame and then scored on `testcv`, leaking the
            # held-out rows into training and inflating accuracy.
            model = clf.fit(train_df.loc[traincv, columns],
                            train_df.loc[traincv, 'Survived'])
            # Score on the held-out rows.
            y_pred = clf.predict(train_df.loc[testcv, columns])
            y_test = train_df.loc[testcv, 'Survived']
            error_rate = 1 - accuracy_score(y_test, y_pred)
            # Keep the best model seen so far.
            if error_rate < minimum['error']:
                minimum['error'] = error_rate
                minimum['model'] = model

print(minimum)
In [26]:
# Best (error, model) pair found by the grid search above.
minimum
Out[26]:
In [176]:
import sklearn.ensemble

# Cross-validated error estimate for an extra-trees ensemble, for
# comparison with the SVM search above.
cv = ShuffleSplit(len(train_df.index), train_size=300, test_size=None)
for traincv, testcv in cv:
    # min_samples_split must be >= 2 in modern sklearn (1 was accepted by
    # very old releases but is equivalent — a 1-sample node cannot split).
    clf = sklearn.ensemble.ExtraTreesClassifier(
        n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
    # BUG FIX: fit on the CV training rows only; the original fit on the
    # whole frame, leaking the held-out rows into training.
    clf.fit(train_df.loc[traincv, columns], train_df.loc[traincv, 'Survived'])
    # Score the held-out rows.
    y_preds = clf.predict(train_df.loc[testcv, columns])
    y_test = train_df.loc[testcv, 'Survived']
    # BUG FIX: score the predictions computed *here* — the original passed a
    # stale `y_pred` left over from the SVM cell to accuracy_score.
    accuracy = accuracy_score(y_test, y_preds)
    error_rate = 1 - accuracy
    # BUG FIX: `kernel` in the original print was a leaked loop variable
    # from the SVM cell; label the output with the actual model name.
    print('ExtraTrees error rate {}'.format(error_rate))
In [30]:
# Final model: rbf SVC with the hand-tuned gamma, trained on all rows.
# (`degree` is ignored by the rbf kernel.)
clf = svm.SVC(kernel='rbf', degree=3, gamma=6)
# BUG FIX: .ix was removed from pandas; .loc is the replacement.
clf.fit(train_df.loc[:, columns], train_df['Survived'])
y_pred = clf.predict(test_df.loc[:, columns])
# pd.Series aligns on test_df's default RangeIndex, so this attaches the
# predictions row-for-row.
test_df["Survived"] = pd.Series(y_pred)
In [31]:
# Preview the two columns required for the Kaggle submission.
# BUG FIX: .ix was removed from pandas; .loc is the replacement.
test_df.loc[:, ['PassengerId', 'Survived']].head()
Out[31]:
In [32]:
# Write the submission file with only the two required columns.
# BUG FIX: to_csv's column-selection parameter is `columns` — the old
# `cols` alias was deprecated and removed, so recent pandas would either
# raise or write every column.
test_df.to_csv("foo.csv", columns=['PassengerId', 'Survived'], index=False)
In [ ]: