In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import datasets, svm
from sklearn.preprocessing import Imputer
import math
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import accuracy_score


/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module dap was already imported from None, but /usr/lib/python2.7/dist-packages is being added to sys.path
  from pkg_resources import resource_stream
/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module PIL was already imported from /usr/local/lib/python2.7/dist-packages/PIL/__init__.pyc, but /usr/lib/python2.7/dist-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:
ls


gendermodel.csv  test.csv   train.sublime-project    Untitled0.ipynb
gendermodel.py   train.csv  train.sublime-workspace

In [3]:
columns = ['Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']

In [4]:
columns = ['Pclass','Sex','Age','SibSp','Parch'];

In [5]:
#columns = ['Fare','Age']

In [7]:
train_df = pd.read_csv("train.csv")
#vectorize training data
train_df['Sex']=train_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

#fills in the na's
#and int columns won't have NaNs else they would be upcast to float.
for col in train_df:
    dt = train_df[col].dtype
    
    #fill in Nana
    if dt== float:
        train_df[col].fillna(train_df[col].mean(), inplace=True)
    #normalize data
    if dt==float or dt==int:    
        train_df[col] -= train_df[col].max()/2
        train_df[col] /=max( train_df[col].max(),math.fabs(train_df[col].max()))

In [8]:
train_df.ix[:,columns].head()


/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
Out[8]:
Pclass Sex Age SibSp Parch
0 1 0 -0.450 -1 -1
1 0 1 -0.050 -1 -1
2 1 1 -0.350 -1 -1
3 0 1 -0.125 -1 -1
4 1 0 -0.125 -1 -1

In [22]:
test_df = pd.read_csv("test.csv")
#vectorize training data
test_df['Sex']=test_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

#fills in the na's
#and int columns won't have NaNs else they would be upcast to float.
for col in test_df:
    dt = test_df[col].dtype
    if col == 'PassengerId':
        print col
    #fill in nan
    if dt== float:
        test_df[col].fillna(test_df[col].mean(), inplace=True)
    #normalize data
    if (dt==float or dt==int) and col != 'PassengerId':    
        test_df[col]-=test_df[col].max()/2
        test_df[col] /=max( test_df[col].max(),math.fabs(test_df[col].max()))


PassengerId

In [24]:
test_df.head()


/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
Out[24]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 1 Kelly, Mr. James 0 -0.092105 -1 -1 330911 -0.969437 NaN Q
1 893 1 Wilkes, Mrs. James (Ellen Needs) 1 0.236842 -1 -1 363272 -0.972674 NaN S
2 894 0 Myles, Mr. Thomas Francis 0 0.631579 -1 -1 240276 -0.962183 NaN Q
3 895 1 Wirz, Mr. Albert 0 -0.289474 -1 -1 315154 -0.966184 NaN S
4 896 1 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 -0.421053 -1 -1 3101298 -0.952033 NaN S

In [25]:
#create a list of the types of kerneks we will use for your analysis
types_of_kernels = ['linear', 'rbf', 'poly']


# specify our color map for plotting the results
color_map = plt.cm.RdBu_r

for fig_num, kernel in enumerate(types_of_kernels):
    
    
    # fit the model
    clf = svm.SVC(kernel=kernel, gamma=3)
    clf.fit(train_df.ix[:,columns],train_df['Survived'])

   
    plt.figure(fig_num)

    plt.scatter(train_df[columns[0]], train_df[columns[1]], c=train_df['Survived'], zorder=10, cmap=color_map)

    # Circle out the test data
    plt.scatter(test_df[columns[0]], test_df[columns[1]], s=80, facecolors='none', zorder=10)

    plt.axis('tight')
    x_min = train_df[columns[0]].min()
    x_max = train_df[columns[0]].max()
    y_min = train_df[columns[1]].min()
    y_max = train_df[columns[1]].max()

    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=color_map)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
               levels=[-.5, 0, .5])
    
    plt.title(kernel)
    plt.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-d845cba9ed48> in <module>()
     28 
     29     XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
---> 30     Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
     31 
     32     # Put the result into a color plot

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in decision_function(self, X)
    359                                       " sparse SVM.")
    360 
--> 361         X = self._validate_for_predict(X)
    362         X = self._compute_kernel(X)
    363 

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _validate_for_predict(self, X)
    402             raise ValueError("X.shape[1] = %d should be equal to %d, "
    403                              "the number of features at training time" %
--> 404                              (n_features, self.shape_fit_[1]))
    405         return X
    406 

ValueError: X.shape[1] = 2 should be equal to 5, the number of features at training time

In [12]:
minimum={"error":1,"model":""}
error_rate=0
types_of_kernels = ['linear', 'rbf', 'poly']
model=""
#create crossvalidation sets to test for accurancy
cv =ShuffleSplit(len(train_df.index), train_size=300,test_size=None)
for traincv, testcv in cv:
    for fig_num, kernel in enumerate(types_of_kernels):
        print kernel
        if kernel == 'linear':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        if kernel == 'poly':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        if kernel == 'rbf':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=3,gamma=i)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        print minimum


linear
{'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=1, gamma=3,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False), 'error': 0.20812182741116747}
rbf
{'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=3,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False), 'error': 0.18443316412859556}
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-2a3c5263d9cc> in <module>()
     26                 # fit the model
     27                 clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
---> 28                 model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
     29 
     30                  #predict the test set

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in fit(self, X, y, sample_weight)
    176 
    177         seed = rnd.randint(np.iinfo('i').max)
--> 178         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
    179         # see comment on the other call to np.iinfo in this file
    180 

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
    231                 cache_size=self.cache_size, coef0=self.coef0,
    232                 gamma=self._gamma, epsilon=self.epsilon,
--> 233                 max_iter=self.max_iter, random_seed=random_seed)
    234 
    235         self._warn_from_fit_status()

KeyboardInterrupt: 
poly

In [26]:
minimum


Out[26]:
{'error': 0.18274111675126903,
 'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=5, gamma=3,
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)}

In [176]:
import sklearn.ensemble
#create crossvalidation sets to test for accurancy
cv =ShuffleSplit(len(train_df.index), train_size=300,test_size=None)
for traincv, testcv in cv:
    # fit the model
    clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    #clf = svm.SVC(kernel=kernel, degree=3,gamma=3)
    clf.fit(train_df.ix[:,columns],train_df['Survived'])
    
     #predict the test set
    y_preds = clf.predict(train_df.ix[testcv,columns])
    y_test = train_df.ix[testcv,'Survived']
    accuracy = accuracy_score(y_test, y_pred)
    error_rate =  1-accuracy
    print '{} error rate {}'.format(kernel,error_rate)


poly error rate 0.424703891709
poly error rate 0.438240270728
poly error rate 0.46192893401
poly error rate 0.433164128596
poly error rate 0.473773265651
poly error rate 0.424703891709
poly error rate 0.46192893401
poly error rate 0.43654822335
poly error rate 0.44331641286
poly error rate 0.465313028765
poly error rate 0.426395939086
poly error rate 0.429780033841
poly error rate 0.465313028765
poly error rate 0.423011844332
poly error rate 0.483925549915
poly error rate 0.46192893401
poly error rate 0.458544839255
poly error rate 0.446700507614
poly error rate 0.419627749577
poly error rate 0.446700507614

In [30]:
clf = svm.SVC(kernel='rbf', degree=3,gamma=6)
clf.fit(train_df.ix[:,columns],train_df['Survived'])
y_pred = clf.predict(test_df.ix[:,columns])
test_df["Survived"] = pd.Series(y_pred)

In [31]:
test_df.ix[:,['PassengerId', 'Survived']].head()


/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
Out[31]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 1

In [32]:
test_df.to_csv("foo.csv", cols=['PassengerId', 'Survived'], index=False)

In [ ]: