In [47]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('../data/hw2data.csv', header = None)
df.head()


Out[2]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [3]:
#change missing values represented by ? to NaN
df.replace('?', np.nan, inplace = True)
df[0].value_counts()


Out[3]:
b    468
a    210
dtype: int64

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     678 non-null object
1     678 non-null object
2     690 non-null float64
3     684 non-null object
4     684 non-null object
5     681 non-null object
6     681 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    677 non-null object
14    690 non-null int64
15    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [6]:
#need to impute values for colums 0,1,3,4,5,6,13
#is there a way to auto calc the percents? I wish
df[0].value_counts() #a-210, b-468
df['0imp'] = df[0]
df['0imp'].fillna(np.random.choice(['a','b'], p = [.31, .69]), inplace = True)

#convert to float, make a copy
df[1] = df[1].astype(float)
df['1imp'] = df[1]
df['1imp'].mean() #31.568171091445429 
df['1imp'].std() #11.957862498270877
df['1imp'].fillna(np.random.normal(31.568171091445429, 11.957862498270877), inplace = True) 
df['1imp'][df['1imp'] < 0] = 0

df[3].value_counts()  #u-519, y-163, l-2
df['3imp'] = df[3]
df['3imp'].fillna(np.random.choice(['u','y','l'], p = [.758, .238, .004]), inplace = True)

df[4].value_counts()
df['4imp'] = df[4]
df['4imp'].fillna(np.random.choice(['g','p','gg'], p = [.758, .238, .004]), inplace = True)

#14 values! there has to be a faster way to do the probability
keyVals = df[5].value_counts()
keys = [x for x in keyVals.keys()]
vals = keyVals.tolist()
val_probability = map(lambda x: float(x)/sum(vals), vals)
df['5imp'] = df[5]
df['5imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)

keyVals = df[6].value_counts()
keys = [x for x in keyVals.keys()]
vals = keyVals.tolist()
val_probability = map(lambda x: float(x)/sum(vals), vals)
df['6imp'] = df[6]
df['6imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)

df['13imp'] = df[13].astype(float)
df['13imp'].mean() #184.01477104874445 
df['13imp'].std() #173.80676822523813 
df['13imp'].fillna(np.random.normal(184.01477104874445 , 173.80676822523813 ), inplace = True) 
df['13imp'][df['13imp'] < 0] = 0

df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 23 columns):
0        678 non-null object
1        678 non-null float64
2        690 non-null float64
3        684 non-null object
4        684 non-null object
5        681 non-null object
6        681 non-null object
7        690 non-null float64
8        690 non-null object
9        690 non-null object
10       690 non-null int64
11       690 non-null object
12       690 non-null object
13       677 non-null object
14       690 non-null int64
15       690 non-null object
0imp     690 non-null object
1imp     690 non-null float64
3imp     690 non-null object
4imp     690 non-null object
5imp     690 non-null object
6imp     690 non-null object
13imp    690 non-null float64
dtypes: float64(5), int64(2), object(16)
memory usage: 129.4+ KB

In [7]:
#change values to + -  boolean
d = {'+': True, '-': False}
df[15] = df[15].map(d)

Plot, get dummies, and transmute


In [8]:
#PLOT STUFF YO
#create a new dataframe with none of the none imputed columsn missing data
dfImputed = df.copy()
for x in [13,6,5,4,3,1,0]:
    dfImputed.drop(x, axis=1, inplace=True)
    
plotted = pd.scatter_matrix(dfImputed, figsize=(15, 15))



In [9]:
#convert to bool on 8, 9, 11, 
d = {'t': True, 'f': False}
dfImputed[8] = dfImputed[8].map(d)
dfImputed[9] = dfImputed[9].map(d)
dfImputed[11] = dfImputed[11].map(d)

In [11]:
#get dummies on 12, '0imp', '3imp','4imp','5imp','6imp',
#ugh some of these values repeat.  Gotta find em and remove em I suppose and convert them to something else

#arr = pd.Series(['b', 'a', 'u', 'y', 'l', 't', 'g', 'p', 'gg', 'c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff', 'v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o', 'g', 'p', 's'])
#arr.value_counts() # j, ff, p and g are the repeat offenders

#j -> jj
dfImputed['6imp'][dfImputed['6imp'] == 'j'] = 'jj'

#ff -> 'fff'
dfImputed['6imp'][dfImputed['6imp'] == 'ff'] = 'fff'

#p -> pp
dfImputed[12][dfImputed[12] == 'p'] = 'pp'

#g -> ggg
dfImputed[12][dfImputed[12] == 'g'] = 'ggg'

In [12]:
for column in [12, '0imp', '3imp','4imp','5imp','6imp']:
    dummies = pd.get_dummies(dfImputed[column])
    dfImputed[dummies.columns] = dummies
    
for x in [12, '0imp', '3imp','4imp','5imp','6imp']:
    dfImputed.drop(x, axis=1, inplace=True)
dfImputed.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 44 columns):
2        690 non-null float64
7        690 non-null float64
8        690 non-null bool
9        690 non-null bool
10       690 non-null int64
11       690 non-null bool
14       690 non-null int64
15       690 non-null bool
1imp     690 non-null float64
13imp    690 non-null float64
ggg      690 non-null float64
pp       690 non-null float64
s        690 non-null float64
a        690 non-null float64
b        690 non-null float64
l        690 non-null float64
u        690 non-null float64
y        690 non-null float64
g        690 non-null float64
gg       690 non-null float64
p        690 non-null float64
aa       690 non-null float64
c        690 non-null float64
cc       690 non-null float64
d        690 non-null float64
e        690 non-null float64
ff       690 non-null float64
i        690 non-null float64
j        690 non-null float64
k        690 non-null float64
m        690 non-null float64
q        690 non-null float64
r        690 non-null float64
w        690 non-null float64
x        690 non-null float64
bb       690 non-null float64
dd       690 non-null float64
fff      690 non-null float64
h        690 non-null float64
jj       690 non-null float64
n        690 non-null float64
o        690 non-null float64
v        690 non-null float64
z        690 non-null float64
dtypes: bool(4), float64(38), int64(2)
memory usage: 223.7 KB

In [13]:
#this is slow but i think it is right, commenting it out because it takes a heck of a long time to run
#plotted = pd.scatter_matrix(dfImputed, figsize=(75,75))

Logistic regression!


In [20]:
#lets use all the values for our model!
keys = list(dfImputed.keys())
keys.remove(15) #removes the y
x_data = keys

In [21]:
x_train, x_test, y_train, y_test = train_test_split(dfImputed[x_data], dfImputed[15])

In [22]:
logReg = LogisticRegression()

In [23]:
logReg.fit(x_train, y_train)


Out[23]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [24]:
logReg.score(x_test, y_test)


Out[24]:
0.87283236994219648

Make an SVM


In [26]:
#a linear one!
linSVC = LinearSVC(C = 1e-3)
linSVC.fit(x_train, y_train)
linSVC.score(x_test, y_test)


Out[26]:
0.83236994219653182

In [27]:
#using SVC!
#linear again! but different!

In [33]:
nlk = SVC()
nlk.fit(x_train, y_train)


Out[33]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [34]:
nlk.score(x_test,y_test)


Out[34]:
0.54913294797687862

grid search it


In [35]:
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(LinearSVC(), d)

In [36]:
gs.fit(x_train, y_train)


Out[36]:
GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  5.01187e-01,   5.41170e-01, ...,   9.26119e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [37]:
gs.best_estimator_


Out[37]:
LinearSVC(C=0.54116952654646366, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [38]:
gs.best_estimator_.score(x_test, y_test)


Out[38]:
0.82658959537572252

In [39]:
#narrow the range for linear

d['C'] = np.logspace(2., 2.5, 20)
gs = GridSearchCV(LinearSVC(), d)

In [40]:
gs.fit(x_train, y_train)
gs.best_estimator_


Out[40]:
LinearSVC(C=162.37767391887209, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [41]:
gs.best_estimator_.score(x_test, y_test)


Out[41]:
0.83815028901734101

In [42]:
#well that is all kinda not great...
#lets try the rbf radial basis function
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_


Out[42]:
SVC(C=0.58434141337351764, cache_size=200, class_weight=None, coef0=0.0,
  degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [43]:
gs.best_estimator_.score(x_test, y_test)


Out[43]:
0.53757225433526012

In [44]:
d['C'] = np.logspace(-.3, 3., 10)
d['gamma'] = np.logspace(-.3, 3., 10)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_


Out[44]:
SVC(C=0.50118723362727224, cache_size=200, class_weight=None, coef0=0.0,
  degree=3, gamma=0.50118723362727224, kernel='rbf', max_iter=-1,
  probability=False, random_state=None, shrinking=True, tol=0.001,
  verbose=False)

In [45]:
gs.best_estimator_.score(x_test, y_test)


Out[45]:
0.52023121387283233

Confusion matrix!


In [49]:
#we had multiple predictions to map with a matrix
#logReg
#linSVC
#nlk
#gs
y_pred = logReg.predict(x_test)
confusion_matrix(y_test, y_pred)


Out[49]:
array([[77, 13],
       [ 9, 74]])

In [50]:
y_pred = linSVC.predict(x_test)
confusion_matrix(y_test, y_pred)


Out[50]:
array([[81,  9],
       [20, 63]])

In [51]:
y_pred = nlk.predict(x_test)
confusion_matrix(y_test, y_pred)


Out[51]:
array([[86,  4],
       [74,  9]])

In [52]:
y_pred = gs.predict(x_test)
confusion_matrix(y_test, y_pred)


Out[52]:
array([[90,  0],
       [83,  0]])