notebook.community

Edit and run



In [47]:

    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report



In [2]:

    
df = pd.read_csv('../data/hw2data.csv', header = None)
df.head()



In [3]:

    
#change missing values represented by ? to NaN
df.replace('?', np.nan, inplace = True)
df[0].value_counts()









    Out[3]:





b    468
a    210
dtype: int64



In [4]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
0     678 non-null object
1     678 non-null object
2     690 non-null float64
3     684 non-null object
4     684 non-null object
5     681 non-null object
6     681 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    677 non-null object
14    690 non-null int64
15    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB



In [6]:

    
#need to impute values for colums 0,1,3,4,5,6,13
#is there a way to auto calc the percents? I wish
df[0].value_counts() #a-210, b-468
df['0imp'] = df[0]
df['0imp'].fillna(np.random.choice(['a','b'], p = [.31, .69]), inplace = True)

#convert to float, make a copy
df[1] = df[1].astype(float)
df['1imp'] = df[1]
df['1imp'].mean() #31.568171091445429 
df['1imp'].std() #11.957862498270877
df['1imp'].fillna(np.random.normal(31.568171091445429, 11.957862498270877), inplace = True) 
df['1imp'][df['1imp'] < 0] = 0

df[3].value_counts()  #u-519, y-163, l-2
df['3imp'] = df[3]
df['3imp'].fillna(np.random.choice(['u','y','l'], p = [.758, .238, .004]), inplace = True)

df[4].value_counts()
df['4imp'] = df[4]
df['4imp'].fillna(np.random.choice(['g','p','gg'], p = [.758, .238, .004]), inplace = True)

#14 values! there has to be a faster way to do the probability
keyVals = df[5].value_counts()
keys = [x for x in keyVals.keys()]
vals = keyVals.tolist()
val_probability = map(lambda x: float(x)/sum(vals), vals)
df['5imp'] = df[5]
df['5imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)

keyVals = df[6].value_counts()
keys = [x for x in keyVals.keys()]
vals = keyVals.tolist()
val_probability = map(lambda x: float(x)/sum(vals), vals)
df['6imp'] = df[6]
df['6imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)

df['13imp'] = df[13].astype(float)
df['13imp'].mean() #184.01477104874445 
df['13imp'].std() #173.80676822523813 
df['13imp'].fillna(np.random.normal(184.01477104874445 , 173.80676822523813 ), inplace = True) 
df['13imp'][df['13imp'] < 0] = 0

df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 23 columns):
0        678 non-null object
1        678 non-null float64
2        690 non-null float64
3        684 non-null object
4        684 non-null object
5        681 non-null object
6        681 non-null object
7        690 non-null float64
8        690 non-null object
9        690 non-null object
10       690 non-null int64
11       690 non-null object
12       690 non-null object
13       677 non-null object
14       690 non-null int64
15       690 non-null object
0imp     690 non-null object
1imp     690 non-null float64
3imp     690 non-null object
4imp     690 non-null object
5imp     690 non-null object
6imp     690 non-null object
13imp    690 non-null float64
dtypes: float64(5), int64(2), object(16)
memory usage: 129.4+ KB



In [7]:

    
#change values to + -  boolean
d = {'+': True, '-': False}
df[15] = df[15].map(d)

Plot, get dummies, and transmute



In [8]:

    
#PLOT STUFF YO
#create a new dataframe with none of the none imputed columsn missing data
dfImputed = df.copy()
for x in [13,6,5,4,3,1,0]:
    dfImputed.drop(x, axis=1, inplace=True)
    
plotted = pd.scatter_matrix(dfImputed, figsize=(15, 15))



In [9]:

    
#convert to bool on 8, 9, 11, 
d = {'t': True, 'f': False}
dfImputed[8] = dfImputed[8].map(d)
dfImputed[9] = dfImputed[9].map(d)
dfImputed[11] = dfImputed[11].map(d)



In [11]:

    
#get dummies on 12, '0imp', '3imp','4imp','5imp','6imp',
#ugh some of these values repeat.  Gotta find em and remove em I suppose and convert them to something else

#arr = pd.Series(['b', 'a', 'u', 'y', 'l', 't', 'g', 'p', 'gg', 'c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff', 'v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o', 'g', 'p', 's'])
#arr.value_counts() # j, ff, p and g are the repeat offenders

#j -> jj
dfImputed['6imp'][dfImputed['6imp'] == 'j'] = 'jj'

#ff -> 'fff'
dfImputed['6imp'][dfImputed['6imp'] == 'ff'] = 'fff'

#p -> pp
dfImputed[12][dfImputed[12] == 'p'] = 'pp'

#g -> ggg
dfImputed[12][dfImputed[12] == 'g'] = 'ggg'



In [12]:

    
for column in [12, '0imp', '3imp','4imp','5imp','6imp']:
    dummies = pd.get_dummies(dfImputed[column])
    dfImputed[dummies.columns] = dummies
    
for x in [12, '0imp', '3imp','4imp','5imp','6imp']:
    dfImputed.drop(x, axis=1, inplace=True)
dfImputed.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 44 columns):
2        690 non-null float64
7        690 non-null float64
8        690 non-null bool
9        690 non-null bool
10       690 non-null int64
11       690 non-null bool
14       690 non-null int64
15       690 non-null bool
1imp     690 non-null float64
13imp    690 non-null float64
ggg      690 non-null float64
pp       690 non-null float64
s        690 non-null float64
a        690 non-null float64
b        690 non-null float64
l        690 non-null float64
u        690 non-null float64
y        690 non-null float64
g        690 non-null float64
gg       690 non-null float64
p        690 non-null float64
aa       690 non-null float64
c        690 non-null float64
cc       690 non-null float64
d        690 non-null float64
e        690 non-null float64
ff       690 non-null float64
i        690 non-null float64
j        690 non-null float64
k        690 non-null float64
m        690 non-null float64
q        690 non-null float64
r        690 non-null float64
w        690 non-null float64
x        690 non-null float64
bb       690 non-null float64
dd       690 non-null float64
fff      690 non-null float64
h        690 non-null float64
jj       690 non-null float64
n        690 non-null float64
o        690 non-null float64
v        690 non-null float64
z        690 non-null float64
dtypes: bool(4), float64(38), int64(2)
memory usage: 223.7 KB



In [13]:

    
#this is slow but i think it is right, commenting it out because it takes a heck of a long time to run
#plotted = pd.scatter_matrix(dfImputed, figsize=(75,75))

Logistic regression!



In [20]:

    
#lets use all the values for our model!
keys = list(dfImputed.keys())
keys.remove(15) #removes the y
x_data = keys



In [21]:

    
x_train, x_test, y_train, y_test = train_test_split(dfImputed[x_data], dfImputed[15])



In [22]:

    
logReg = LogisticRegression()



In [23]:

    
logReg.fit(x_train, y_train)









    Out[23]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [24]:

    
logReg.score(x_test, y_test)









    Out[24]:





0.87283236994219648

Make an SVM



In [26]:

    
#a linear one!
linSVC = LinearSVC(C = 1e-3)
linSVC.fit(x_train, y_train)
linSVC.score(x_test, y_test)









    Out[26]:





0.83236994219653182



In [27]:

    
#using SVC!
#linear again! but different!



In [33]:

    
nlk = SVC()
nlk.fit(x_train, y_train)









    Out[33]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)



In [34]:

    
nlk.score(x_test,y_test)









    Out[34]:





0.54913294797687862

grid search it



In [35]:

    
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(LinearSVC(), d)



In [36]:

    
gs.fit(x_train, y_train)









    Out[36]:





GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  5.01187e-01,   5.41170e-01, ...,   9.26119e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)



In [37]:

    
gs.best_estimator_









    Out[37]:





LinearSVC(C=0.54116952654646366, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)



In [38]:

    
gs.best_estimator_.score(x_test, y_test)









    Out[38]:





0.82658959537572252



In [39]:

    
#narrow the range for linear

d['C'] = np.logspace(2., 2.5, 20)
gs = GridSearchCV(LinearSVC(), d)



In [40]:

    
gs.fit(x_train, y_train)
gs.best_estimator_









    Out[40]:





LinearSVC(C=162.37767391887209, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)



In [41]:

    
gs.best_estimator_.score(x_test, y_test)









    Out[41]:





0.83815028901734101



In [42]:

    
#well that is all kinda not great...
#lets try the rbf radial basis function
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_









    Out[42]:





SVC(C=0.58434141337351764, cache_size=200, class_weight=None, coef0=0.0,
  degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)



In [43]:

    
gs.best_estimator_.score(x_test, y_test)









    Out[43]:





0.53757225433526012



In [44]:

    
d['C'] = np.logspace(-.3, 3., 10)
d['gamma'] = np.logspace(-.3, 3., 10)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_









    Out[44]:





SVC(C=0.50118723362727224, cache_size=200, class_weight=None, coef0=0.0,
  degree=3, gamma=0.50118723362727224, kernel='rbf', max_iter=-1,
  probability=False, random_state=None, shrinking=True, tol=0.001,
  verbose=False)



In [45]:

    
gs.best_estimator_.score(x_test, y_test)









    Out[45]:





0.52023121387283233

Confusion matrix!



In [49]:

    
#we had multiple predictions to map with a matrix
#logReg
#linSVC
#nlk
#gs
y_pred = logReg.predict(x_test)
confusion_matrix(y_test, y_pred)









    Out[49]:





array([[77, 13],
       [ 9, 74]])



In [50]:

    
y_pred = linSVC.predict(x_test)
confusion_matrix(y_test, y_pred)









    Out[50]:





array([[81,  9],
       [20, 63]])



In [51]:

    
y_pred = nlk.predict(x_test)
confusion_matrix(y_test, y_pred)









    Out[51]:





array([[86,  4],
       [74,  9]])



In [52]:

    
y_pred = gs.predict(x_test)
confusion_matrix(y_test, y_pred)









    Out[52]:





array([[90,  0],
       [83,  0]])

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
0	b	30.83	0.000	u	g	w	v	1.25	t	t	1	f	g	00202	0	+
1	a	58.67	4.460	u	g	q	h	3.04	t	t	6	f	g	00043	560	+
2	a	24.50	0.500	u	g	q	h	1.50	t	f	0	f	g	00280	824	+
3	b	27.83	1.540	u	g	w	v	3.75	t	t	5	t	g	00100	3	+
4	b	20.17	5.625	u	g	w	v	1.71	t	f	0	f	s	00120	0	+