In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
In [2]:
df = pd.read_csv('../data/hw2data.csv', header = None)
df.head()
Out[2]:
In [3]:
#change missing values represented by ? to NaN
df.replace('?', np.nan, inplace = True)
df[0].value_counts()
Out[3]:
In [4]:
df.info()
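The missing fraction per column can also be computed directly rather than read off the non-null counts above; a quick check along those lines:
In [ ]:
#sketch: fraction of missing values in each column, to confirm which ones need imputation
df.isnull().mean().sort_values(ascending=False)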
In [6]:
#need to impute values for columns 0, 1, 3, 4, 5, 6, 13
#the category probabilities below are typed in by hand from value_counts(); a helper that computes them automatically is sketched after this cell
df[0].value_counts() #a-210, b-468
df['0imp'] = df[0]
df['0imp'].fillna(np.random.choice(['a','b'], p = [.31, .69]), inplace = True)
#convert to float, make a copy
df[1] = df[1].astype(float)
df['1imp'] = df[1]
df['1imp'].mean() #31.568171091445429
df['1imp'].std() #11.957862498270877
df['1imp'].fillna(np.random.normal(31.568171091445429, 11.957862498270877), inplace = True)
df.loc[df['1imp'] < 0, '1imp'] = 0 #clip any negative draws from the normal to zero
df[3].value_counts() #u-519, y-163, l-2
df['3imp'] = df[3]
df['3imp'].fillna(np.random.choice(['u','y','l'], p = [.758, .238, .004]), inplace = True)
df[4].value_counts()
df['4imp'] = df[4]
df['4imp'].fillna(np.random.choice(['g','p','gg'], p = [.758, .238, .004]), inplace = True)
#14 distinct values this time, so compute the probabilities from the value counts instead of typing them in
keyVals = df[5].value_counts()
keys = list(keyVals.index)
vals = keyVals.tolist()
val_probability = [float(x) / sum(vals) for x in vals]
df['5imp'] = df[5]
df['5imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)
keyVals = df[6].value_counts()
keys = list(keyVals.index)
vals = keyVals.tolist()
val_probability = [float(x) / sum(vals) for x in vals]
df['6imp'] = df[6]
df['6imp'].fillna(np.random.choice(keys, p = val_probability), inplace = True)
df['13imp'] = df[13].astype(float)
df['13imp'].mean() #184.01477104874445
df['13imp'].std() #173.80676822523813
df['13imp'].fillna(np.random.normal(184.01477104874445 , 173.80676822523813 ), inplace = True)
df.loc[df['13imp'] < 0, '13imp'] = 0 #clip negative draws to zero
df.info()
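The block above repeats one pattern per column, and note that `fillna` with a single `np.random.choice(...)` call fills every missing row with the same sampled value. A minimal sketch of a helper that derives the category probabilities from `value_counts()` and draws a fresh value for each missing row (the name `impute_categorical` is my own, not part of the original notebook):
In [ ]:
#sketch only: probabilistic imputation for a categorical column, with the
#probabilities computed from the observed counts and one draw per missing row
def impute_categorical(frame, col, new_col):
    counts = frame[col].value_counts()
    probs = (counts / counts.sum()).values  #auto-computed category frequencies
    out = frame[col].copy()
    missing = out.isnull()
    out[missing] = np.random.choice(counts.index, size=missing.sum(), p=probs)
    frame[new_col] = out
    return frame

#the hand-built columns above could then be produced in one loop, e.g.
#for c in [0, 3, 4, 5, 6]:
#    df = impute_categorical(df, c, '{}imp'.format(c))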
In [7]:
#map the '+'/'-' class labels in column 15 to booleans
d = {'+': True, '-': False}
df[15] = df[15].map(d)
In [8]:
#PLOT STUFF YO
#create a new dataframe that drops the original, non-imputed versions of the columns that had missing data
dfImputed = df.copy()
for x in [13, 6, 5, 4, 3, 1, 0]:
    dfImputed.drop(x, axis=1, inplace=True)
plotted = pd.plotting.scatter_matrix(dfImputed, figsize=(15, 15))
In [9]:
#convert columns 8, 9, 11 to booleans
d = {'t': True, 'f': False}
dfImputed[8] = dfImputed[8].map(d)
dfImputed[9] = dfImputed[9].map(d)
dfImputed[11] = dfImputed[11].map(d)
In [11]:
#get dummies on 12, '0imp', '3imp', '4imp', '5imp', '6imp'
#some category values repeat across these columns, which would give colliding dummy column names,
#so rename the repeat offenders first (an alternative using get_dummies prefixes is sketched after the next cell)
#arr = pd.Series(['b', 'a', 'u', 'y', 'l', 't', 'g', 'p', 'gg', 'c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff', 'v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o', 'g', 'p', 's'])
#arr.value_counts() # j, ff, p and g are the repeat offenders
#j -> jj
dfImputed.loc[dfImputed['6imp'] == 'j', '6imp'] = 'jj'
#ff -> fff
dfImputed.loc[dfImputed['6imp'] == 'ff', '6imp'] = 'fff'
#p -> pp
dfImputed.loc[dfImputed[12] == 'p', 12] = 'pp'
#g -> ggg
dfImputed.loc[dfImputed[12] == 'g', 12] = 'ggg'
In [12]:
for column in [12, '0imp', '3imp', '4imp', '5imp', '6imp']:
    dummies = pd.get_dummies(dfImputed[column])
    dfImputed[dummies.columns] = dummies
for x in [12, '0imp', '3imp', '4imp', '5imp', '6imp']:
    dfImputed.drop(x, axis=1, inplace=True)
dfImputed.info()
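As an aside, the manual renaming of the colliding categories could be avoided by letting `get_dummies` prefix each dummy column with its source column; a sketch on the un-renamed `df` columns (not what feeds the models below):
In [ ]:
#sketch: prefixing dummy columns by source column avoids name collisions such as 'j', 'ff', 'p', 'g'
cat_cols = [12, '0imp', '3imp', '4imp', '5imp', '6imp']
prefixed = pd.get_dummies(df[cat_cols], prefix=[str(c) for c in cat_cols])
prefixed.head()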
In [13]:
#this is slow but I think it is right; commented out because it takes a long time to run
#plotted = pd.plotting.scatter_matrix(dfImputed, figsize=(75, 75))
In [20]:
#let's use all of the features for the model
keys = list(dfImputed.keys())
keys.remove(15) #removes the y
x_data = keys
In [21]:
x_train, x_test, y_train, y_test = train_test_split(dfImputed[x_data], dfImputed[15])
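One caveat: `train_test_split` shuffles at random, so the scores below will move around from run to run. A reproducible, class-balanced variant is sketched here (the `_r` names and `random_state=42` are my additions and are not used in the runs below):
In [ ]:
#sketch: pin the split for reproducibility and keep the +/- ratio equal across splits
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(
    dfImputed[x_data], dfImputed[15], random_state=42, stratify=dfImputed[15])
y_train_r.mean(), y_test_r.mean() #fraction of True (+) labels in each split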
In [22]:
logReg = LogisticRegression()
In [23]:
logReg.fit(x_train, y_train)
Out[23]:
In [24]:
logReg.score(x_test, y_test)
Out[24]:
In [26]:
#a linear one!
linSVC = LinearSVC(C = 1e-3)
linSVC.fit(x_train, y_train)
linSVC.score(x_test, y_test)
Out[26]:
In [27]:
#using SVC!
#note: SVC() defaults to the RBF kernel, so this one is not linear
In [33]:
nlk = SVC()
nlk.fit(x_train, y_train)
Out[33]:
In [34]:
nlk.score(x_test,y_test)
Out[34]:
In [35]:
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(LinearSVC(), d)
In [36]:
gs.fit(x_train, y_train)
Out[36]:
In [37]:
gs.best_estimator_
Out[37]:
In [38]:
gs.best_estimator_.score(x_test, y_test)
Out[38]:
In [39]:
#narrow the C range for the linear SVM
d['C'] = np.logspace(2., 2.5, 20)
gs = GridSearchCV(LinearSVC(), d)
In [40]:
gs.fit(x_train, y_train)
gs.best_estimator_
Out[40]:
In [41]:
gs.best_estimator_.score(x_test, y_test)
Out[41]:
In [42]:
#well, none of that is great...
#let's grid-search an SVC with the RBF (radial basis function) kernel
d = {}
d['C'] = np.logspace(-.3, 3., 100)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_
Out[42]:
In [43]:
gs.best_estimator_.score(x_test, y_test)
Out[43]:
In [44]:
d['C'] = np.logspace(-.3, 3., 10)
d['gamma'] = np.logspace(-.3, 3., 10)
gs = GridSearchCV(SVC(), d)
gs.fit(x_train, y_train)
gs.best_estimator_
Out[44]:
In [45]:
gs.best_estimator_.score(x_test, y_test)
Out[45]:
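The SVM scores may be held back partly because the continuous features sit on very different scales, and SVMs are scale-sensitive. A sketch that puts `StandardScaler` and the RBF `SVC` in a `Pipeline` inside the grid search (the step names, grid ranges, and the `gs_scaled` name are my choices, not from the original run):
In [ ]:
#sketch: scale the features inside the cross-validation loop before the RBF SVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('svc', SVC())])
params = {'svc__C': np.logspace(-1., 3., 10), 'svc__gamma': np.logspace(-3., 1., 10)}
gs_scaled = GridSearchCV(pipe, params)
gs_scaled.fit(x_train, y_train)
gs_scaled.score(x_test, y_test)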
In [49]:
#we have several fitted models; compare their confusion matrices
#logReg
#linSVC
#nlk
#gs
y_pred = logReg.predict(x_test)
confusion_matrix(y_test, y_pred)
Out[49]:
In [50]:
y_pred = linSVC.predict(x_test)
confusion_matrix(y_test, y_pred)
Out[50]:
In [51]:
y_pred = nlk.predict(x_test)
confusion_matrix(y_test, y_pred)
Out[51]:
In [52]:
y_pred = gs.predict(x_test)
confusion_matrix(y_test, y_pred)
Out[52]:
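`classification_report` was imported at the top but never used; per-class precision, recall, and F1 for the grid-searched model complement the raw confusion matrices:
In [ ]:
#per-class precision, recall, and F1 for the last grid-searched model
print(classification_report(y_test, gs.predict(x_test)))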