In [26]:
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('../data/DSL-StrongPasswordData.csv', sep = ',')

df = df.drop(['sessionIndex','rep'],1)

Y = df['subject'].apply(lambda x: int(x[-2:]))

X = df.drop('subject',1)
X = pd.DataFrame(preprocessing.scale(X))

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
def score(clf,X_test,Y_test):
    
    count = 0
    score = 0

    for i in X_test.index.values:
        y_hat = clf.predict(X_test.ix[i].reshape(1,-1))
        if y_hat == Y_test[i]:
            score += 1
        count += 1
    
    return round(score/count*100,2)

In [16]:
clf = svm.SVC()
clf.fit(X_train,Y_train)

print('This initial SVM classifier is ' + str(score(clf,X_test,Y_test)) + '% accurate')


This initial SVM classifier is 88.14% accurate

In [17]:
results = []

for i in [1,25,50,75,100]:
    print(i)
    results.append(score(svm.SVC(C = i).fit(X_train,Y_train),X_test,Y_test))
    
results


1
25
50
75
100
Out[17]:
[88.14, 90.12, 90.0, 89.9, 89.95]

In [18]:
# test different kernels

for i in ['linear','rbf','poly','sigmoid']:
    print(i + ': ' + str(score(svm.SVC(kernel = i).fit(X_train,Y_train),X_test,Y_test)) + '%')


linear: 84.85%
rbf: 88.14%
poly: 73.73%
sigmoid: 1.32%

In [19]:
def score_complex(clf,X_test,Y_test):
    
    count = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    
    user = Y_test.value_counts()[1]
    imposter = Y_test.value_counts()[0]

    for i in X_test.index.values:
        
        y_hat = clf.predict(X_test.ix[i].reshape(1,-1))
        
        if Y_test[i] == 1 and y_hat == 1:
            tp += 1
        if Y_test[i] == 1 and y_hat == 0:
            fn += 1
        if Y_test[i] == 0 and y_hat == 1:
            fp += 1
        if Y_test[i] == 0 and y_hat == 0:
            tn += 1
        
        count += 1
    
    return (round(tp/user*100,2),round(tn/imposter*100,2),round(fp/imposter*100,2),round(fn/user*100,2))

In [20]:
values = Y.values

true_positives = []
false_negatives = []
true_negatives = []
false_negatives = []

for i in [x for x in range(2,58) if x in values]:

    Y = df['subject'].apply(lambda x: int(x[-2:])).apply(lambda x: 1 if x == i else 0)

    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=42)

    clf = svm.SVC()
    clf.fit(X_train,Y_train)

    scores = score_complex(clf,X_test,Y_test)
    
    true_positives.append(scores[0])
    false_negatives.append(scores[3])
    true_negatives.append(scores[1])
    false_negatives.append(scores[2])

    print('SUBJECT' + str(i) + ': \n True-Positive = ' + str(scores[0]) + '% \n False-Negative = ' + str(scores[3]) + '% \n True-Negative = ' + str(scores[1]) + '% \n False-Positive = ' + str(scores[2]) + '%')


SUBJECT2: 
 True-Positive = 51.39% 
 False-Negative = 48.61% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT3: 
 True-Positive = 59.42% 
 False-Negative = 40.58% 
 True-Negative = 99.93% 
 False-Positive = 0.07%
SUBJECT4: 
 True-Positive = 67.12% 
 False-Negative = 32.88% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT5: 
 True-Positive = 83.56% 
 False-Negative = 16.44% 
 True-Negative = 99.93% 
 False-Positive = 0.07%
SUBJECT7: 
 True-Positive = 42.59% 
 False-Negative = 57.41% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT8: 
 True-Positive = 49.43% 
 False-Negative = 50.57% 
 True-Negative = 99.97% 
 False-Positive = 0.03%
SUBJECT10: 
 True-Positive = 84.54% 
 False-Negative = 15.46% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT11: 
 True-Positive = 78.67% 
 False-Negative = 21.33% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT12: 
 True-Positive = 75.27% 
 False-Negative = 24.73% 
 True-Negative = 99.97% 
 False-Positive = 0.03%
SUBJECT13: 
 True-Positive = 77.78% 
 False-Negative = 22.22% 
 True-Negative = 99.93% 
 False-Positive = 0.07%
SUBJECT15: 
 True-Positive = 65.52% 
 False-Negative = 34.48% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT16: 
 True-Positive = 88.24% 
 False-Negative = 11.76% 
 True-Negative = 99.97% 
 False-Positive = 0.03%
SUBJECT17: 
 True-Positive = 87.8% 
 False-Negative = 12.2% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT18: 
 True-Positive = 75.34% 
 False-Negative = 24.66% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT19: 
 True-Positive = 88.75% 
 False-Negative = 11.25% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT20: 
 True-Positive = 56.98% 
 False-Negative = 43.02% 
 True-Negative = 99.85% 
 False-Positive = 0.15%
SUBJECT21: 
 True-Positive = 61.63% 
 False-Negative = 38.37% 
 True-Negative = 99.82% 
 False-Positive = 0.18%
SUBJECT22: 
 True-Positive = 94.44% 
 False-Negative = 5.56% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT24: 
 True-Positive = 90.41% 
 False-Negative = 9.59% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT25: 
 True-Positive = 89.41% 
 False-Negative = 10.59% 
 True-Negative = 99.9% 
 False-Positive = 0.1%
SUBJECT26: 
 True-Positive = 54.84% 
 False-Negative = 45.16% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT27: 
 True-Positive = 81.93% 
 False-Negative = 18.07% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT28: 
 True-Positive = 87.06% 
 False-Negative = 12.94% 
 True-Negative = 99.97% 
 False-Positive = 0.03%
SUBJECT29: 
 True-Positive = 73.68% 
 False-Negative = 26.32% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT30: 
 True-Positive = 85.06% 
 False-Negative = 14.94% 
 True-Negative = 99.9% 
 False-Positive = 0.1%
SUBJECT31: 
 True-Positive = 31.87% 
 False-Negative = 68.13% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT32: 
 True-Positive = 14.29% 
 False-Negative = 85.71% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT33: 
 True-Positive = 91.3% 
 False-Negative = 8.7% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT34: 
 True-Positive = 70.15% 
 False-Negative = 29.85% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT35: 
 True-Positive = 83.95% 
 False-Negative = 16.05% 
 True-Negative = 99.92% 
 False-Positive = 0.08%
SUBJECT36: 
 True-Positive = 91.25% 
 False-Negative = 8.75% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT37: 
 True-Positive = 48.72% 
 False-Negative = 51.28% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT38: 
 True-Positive = 82.72% 
 False-Negative = 17.28% 
 True-Negative = 99.92% 
 False-Positive = 0.08%
SUBJECT39: 
 True-Positive = 81.82% 
 False-Negative = 18.18% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT40: 
 True-Positive = 80.0% 
 False-Negative = 20.0% 
 True-Negative = 99.97% 
 False-Positive = 0.03%
SUBJECT41: 
 True-Positive = 78.12% 
 False-Negative = 21.88% 
 True-Negative = 99.8% 
 False-Positive = 0.2%
SUBJECT42: 
 True-Positive = 93.67% 
 False-Negative = 6.33% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT43: 
 True-Positive = 97.4% 
 False-Negative = 2.6% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT44: 
 True-Positive = 91.36% 
 False-Negative = 8.64% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT46: 
 True-Positive = 64.04% 
 False-Negative = 35.96% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT47: 
 True-Positive = 69.01% 
 False-Negative = 30.99% 
 True-Negative = 99.83% 
 False-Positive = 0.17%
SUBJECT48: 
 True-Positive = 67.12% 
 False-Negative = 32.88% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT49: 
 True-Positive = 81.48% 
 False-Negative = 18.52% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT50: 
 True-Positive = 61.8% 
 False-Negative = 38.2% 
 True-Negative = 99.85% 
 False-Positive = 0.15%
SUBJECT51: 
 True-Positive = 70.67% 
 False-Negative = 29.33% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT52: 
 True-Positive = 93.18% 
 False-Negative = 6.82% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT53: 
 True-Positive = 93.06% 
 False-Negative = 6.94% 
 True-Negative = 99.98% 
 False-Positive = 0.02%
SUBJECT54: 
 True-Positive = 60.24% 
 False-Negative = 39.76% 
 True-Negative = 99.9% 
 False-Positive = 0.1%
SUBJECT55: 
 True-Positive = 93.85% 
 False-Negative = 6.15% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT56: 
 True-Positive = 66.67% 
 False-Negative = 33.33% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT57: 
 True-Positive = 63.44% 
 False-Negative = 36.56% 
 True-Negative = 99.92% 
 False-Positive = 0.08%

In [29]:
plt.figure()
sns.boxplot([true_positives, false_negatives])
plt.show()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-29-212e1fc5c546> in <module>()
      1 plt.figure()
----> 2 sns.boxplot([true_positives, false_negatives])
      3 plt.show()

/Users/justin/anaconda/lib/python3.5/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, fliersize, linewidth, whis, notch, ax, **kwargs)
   2133     kwargs.update(dict(whis=whis, notch=notch))
   2134 
-> 2135     plotter.plot(ax, kwargs)
   2136     return ax
   2137 

/Users/justin/anaconda/lib/python3.5/site-packages/seaborn/categorical.py in plot(self, ax, boxplot_kws)
    522     def plot(self, ax, boxplot_kws):
    523         """Make the plot."""
--> 524         self.draw_boxplot(ax, boxplot_kws)
    525         self.annotate_axes(ax)
    526         if self.orient == "h":

/Users/justin/anaconda/lib/python3.5/site-packages/seaborn/categorical.py in draw_boxplot(self, ax, kws)
    459                                          positions=[i],
    460                                          widths=self.width,
--> 461                                          **kws)
    462                 color = self.colors[i]
    463                 self.restyle_boxplot(artist_dict, color, props)

/Users/justin/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1809                     warnings.warn(msg % (label_namer, func.__name__),
   1810                                   RuntimeWarning, stacklevel=2)
-> 1811             return func(ax, *args, **kwargs)
   1812         pre_doc = inner.__doc__
   1813         if pre_doc is None:

/Users/justin/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in boxplot(self, x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_xticks)
   3206             bootstrap = rcParams['boxplot.bootstrap']
   3207         bxpstats = cbook.boxplot_stats(x, whis=whis, bootstrap=bootstrap,
-> 3208                                        labels=labels)
   3209         if notch is None:
   3210             notch = rcParams['boxplot.notch']

/Users/justin/anaconda/lib/python3.5/site-packages/matplotlib/cbook.py in boxplot_stats(X, whis, bootstrap, labels)
   2009 
   2010         # arithmetic mean
-> 2011         stats['mean'] = np.mean(x)
   2012 
   2013         # medians and quartiles

/Users/justin/anaconda/lib/python3.5/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
   2872 
   2873     return _methods._mean(a, axis=axis, dtype=dtype,
-> 2874                           out=out, keepdims=keepdims)
   2875 
   2876 

/Users/justin/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
     70         ret = ret.dtype.type(ret / rcount)
     71     else:
---> 72         ret = ret / rcount
     73 
     74     return ret

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [ ]: