notebook.community

Edit and run



In [1]:

    
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import datasets, svm
from sklearn.preprocessing import Imputer
import math
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import accuracy_score









    



/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module dap was already imported from None, but /usr/lib/python2.7/dist-packages is being added to sys.path
  from pkg_resources import resource_stream
/usr/local/lib/python2.7/dist-packages/pytz/__init__.py:29: UserWarning: Module PIL was already imported from /usr/local/lib/python2.7/dist-packages/PIL/__init__.pyc, but /usr/lib/python2.7/dist-packages is being added to sys.path
  from pkg_resources import resource_stream



In [2]:

    
ls









    



gendermodel.csv  test.csv   train.sublime-project    Untitled0.ipynb
gendermodel.py   train.csv  train.sublime-workspace



In [3]:

    
columns = ['Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']



In [4]:

    
columns = ['Pclass','Sex','Age','SibSp','Parch'];



In [5]:

    
#columns = ['Fare','Age']



In [7]:

    
train_df = pd.read_csv("train.csv")
#vectorize training data
train_df['Sex']=train_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

#fills in the na's
#and int columns won't have NaNs else they would be upcast to float.
for col in train_df:
    dt = train_df[col].dtype
    
    #fill in Nana
    if dt== float:
        train_df[col].fillna(train_df[col].mean(), inplace=True)
    #normalize data
    if dt==float or dt==int:    
        train_df[col] -= train_df[col].max()/2
        train_df[col] /=max( train_df[col].max(),math.fabs(train_df[col].max()))



In [8]:

    
train_df.ix[:,columns].head()









    



/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)






    Out[8]:






  
    
      
      Pclass
      Sex
      Age
      SibSp
      Parch
    
  
  
    
      0
       1
       0
      -0.450
      -1
      -1
    
    
      1
       0
       1
      -0.050
      -1
      -1
    
    
      2
       1
       1
      -0.350
      -1
      -1
    
    
      3
       0
       1
      -0.125
      -1
      -1
    
    
      4
       1
       0
      -0.125
      -1
      -1



In [22]:

    
test_df = pd.read_csv("test.csv")
#vectorize training data
test_df['Sex']=test_df["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

#fills in the na's
#and int columns won't have NaNs else they would be upcast to float.
for col in test_df:
    dt = test_df[col].dtype
    if col == 'PassengerId':
        print col
    #fill in nan
    if dt== float:
        test_df[col].fillna(test_df[col].mean(), inplace=True)
    #normalize data
    if (dt==float or dt==int) and col != 'PassengerId':    
        test_df[col]-=test_df[col].max()/2
        test_df[col] /=max( test_df[col].max(),math.fabs(test_df[col].max()))









    



PassengerId



In [24]:

    
test_df.head()









    



/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)






    Out[24]:






  
    
      
      PassengerId
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
       892
       1
                                   Kelly, Mr. James
       0
      -0.092105
      -1
      -1
        330911
      -0.969437
       NaN
       Q
    
    
      1
       893
       1
                   Wilkes, Mrs. James (Ellen Needs)
       1
       0.236842
      -1
      -1
        363272
      -0.972674
       NaN
       S
    
    
      2
       894
       0
                          Myles, Mr. Thomas Francis
       0
       0.631579
      -1
      -1
        240276
      -0.962183
       NaN
       Q
    
    
      3
       895
       1
                                   Wirz, Mr. Albert
       0
      -0.289474
      -1
      -1
        315154
      -0.966184
       NaN
       S
    
    
      4
       896
       1
       Hirvonen, Mrs. Alexander (Helga E Lindqvist)
       1
      -0.421053
      -1
      -1
       3101298
      -0.952033
       NaN
       S



In [25]:

    
#create a list of the types of kerneks we will use for your analysis
types_of_kernels = ['linear', 'rbf', 'poly']


# specify our color map for plotting the results
color_map = plt.cm.RdBu_r

for fig_num, kernel in enumerate(types_of_kernels):
    
    
    # fit the model
    clf = svm.SVC(kernel=kernel, gamma=3)
    clf.fit(train_df.ix[:,columns],train_df['Survived'])

   
    plt.figure(fig_num)

    plt.scatter(train_df[columns[0]], train_df[columns[1]], c=train_df['Survived'], zorder=10, cmap=color_map)

    # Circle out the test data
    plt.scatter(test_df[columns[0]], test_df[columns[1]], s=80, facecolors='none', zorder=10)

    plt.axis('tight')
    x_min = train_df[columns[0]].min()
    x_max = train_df[columns[0]].max()
    y_min = train_df[columns[1]].min()
    y_max = train_df[columns[1]].max()

    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z > 0, cmap=color_map)
    plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
               levels=[-.5, 0, .5])
    
    plt.title(kernel)
    plt.show()









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-d845cba9ed48> in <module>()
     28 
     29     XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
---> 30     Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
     31 
     32     # Put the result into a color plot

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in decision_function(self, X)
    359                                       " sparse SVM.")
    360 
--> 361         X = self._validate_for_predict(X)
    362         X = self._compute_kernel(X)
    363 

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _validate_for_predict(self, X)
    402             raise ValueError("X.shape[1] = %d should be equal to %d, "
    403                              "the number of features at training time" %
--> 404                              (n_features, self.shape_fit_[1]))
    405         return X
    406 

ValueError: X.shape[1] = 2 should be equal to 5, the number of features at training time



In [12]:

    
minimum={"error":1,"model":""}
error_rate=0
types_of_kernels = ['linear', 'rbf', 'poly']
model=""
#create crossvalidation sets to test for accurancy
cv =ShuffleSplit(len(train_df.index), train_size=300,test_size=None)
for traincv, testcv in cv:
    for fig_num, kernel in enumerate(types_of_kernels):
        print kernel
        if kernel == 'linear':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        if kernel == 'poly':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        if kernel == 'rbf':
            for i in range(1,10):
                # fit the model
                clf = svm.SVC(kernel=kernel, degree=3,gamma=i)
                model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
                
                 #predict the test set
                y_pred = clf.predict(train_df.ix[testcv,columns])
                y_test = train_df.ix[testcv,'Survived']
                accuracy = accuracy_score(y_test, y_pred)
                error_rate =  1-accuracy
                if error_rate < minimum['error']:
                    minimum['error']=error_rate
                    minimum['model']=model
        print minimum









    



linear
{'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=1, gamma=3,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False), 'error': 0.20812182741116747}
rbf
{'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=3,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False), 'error': 0.18443316412859556}





    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-2a3c5263d9cc> in <module>()
     26                 # fit the model
     27                 clf = svm.SVC(kernel=kernel, degree=i,gamma=3)
---> 28                 model = clf.fit(train_df.ix[:,columns],train_df['Survived'])
     29 
     30                  #predict the test set

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in fit(self, X, y, sample_weight)
    176 
    177         seed = rnd.randint(np.iinfo('i').max)
--> 178         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
    179         # see comment on the other call to np.iinfo in this file
    180 

/usr/local/lib/python2.7/dist-packages/sklearn/svm/base.pyc in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
    231                 cache_size=self.cache_size, coef0=self.coef0,
    232                 gamma=self._gamma, epsilon=self.epsilon,
--> 233                 max_iter=self.max_iter, random_seed=random_seed)
    234 
    235         self._warn_from_fit_status()

KeyboardInterrupt: 





    



poly



In [26]:

    
minimum









    Out[26]:





{'error': 0.18274111675126903,
 'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=5, gamma=3,
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)}



In [176]:

    
import sklearn.ensemble
#create crossvalidation sets to test for accurancy
cv =ShuffleSplit(len(train_df.index), train_size=300,test_size=None)
for traincv, testcv in cv:
    # fit the model
    clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    #clf = svm.SVC(kernel=kernel, degree=3,gamma=3)
    clf.fit(train_df.ix[:,columns],train_df['Survived'])
    
     #predict the test set
    y_preds = clf.predict(train_df.ix[testcv,columns])
    y_test = train_df.ix[testcv,'Survived']
    accuracy = accuracy_score(y_test, y_pred)
    error_rate =  1-accuracy
    print '{} error rate {}'.format(kernel,error_rate)









    



poly error rate 0.424703891709
poly error rate 0.438240270728
poly error rate 0.46192893401
poly error rate 0.433164128596
poly error rate 0.473773265651
poly error rate 0.424703891709
poly error rate 0.46192893401
poly error rate 0.43654822335
poly error rate 0.44331641286
poly error rate 0.465313028765
poly error rate 0.426395939086
poly error rate 0.429780033841
poly error rate 0.465313028765
poly error rate 0.423011844332
poly error rate 0.483925549915
poly error rate 0.46192893401
poly error rate 0.458544839255
poly error rate 0.446700507614
poly error rate 0.419627749577
poly error rate 0.446700507614



In [30]:

    
clf = svm.SVC(kernel='rbf', degree=3,gamma=6)
clf.fit(train_df.ix[:,columns],train_df['Survived'])
y_pred = clf.predict(test_df.ix[:,columns])
test_df["Survived"] = pd.Series(y_pred)



In [31]:

    
test_df.ix[:,['PassengerId', 'Survived']].head()









    



/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated.

  warnings.warn(d.msg, DeprecationWarning)






    Out[31]:






  
    
      
      PassengerId
      Survived
    
  
  
    
      0
       892
       0
    
    
      1
       893
       0
    
    
      2
       894
       0
    
    
      3
       895
       0
    
    
      4
       896
       1



In [32]:

    
test_df.to_csv("foo.csv", cols=['PassengerId', 'Survived'], index=False)



In [ ]:

	Pclass	Sex	Age	SibSp	Parch
0	1	0	-0.450	-1	-1
1	0	1	-0.050	-1	-1
2	1	1	-0.350	-1	-1
3	0	1	-0.125	-1	-1
4	1	0	-0.125	-1	-1

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	1	Kelly, Mr. James	0	-0.092105	-1	-1	330911	-0.969437	NaN	Q
1	893	1	Wilkes, Mrs. James (Ellen Needs)	1	0.236842	-1	-1	363272	-0.972674	NaN	S
2	894	0	Myles, Mr. Thomas Francis	0	0.631579	-1	-1	240276	-0.962183	NaN	Q
3	895	1	Wirz, Mr. Albert	0	-0.289474	-1	-1	315154	-0.966184	NaN	S
4	896	1	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	1	-0.421053	-1	-1	3101298	-0.952033	NaN	S