In [74]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
# Enable inline Plots
%matplotlib inline
# Limit rows displayed in notebook
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.precision', 2)
In [75]:
columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
data = pd.read_csv('../hw2/crx.data', names = columns)
In [76]:
data.info()
In [77]:
data.describe()
Out[77]:
In [78]:
data.replace('?', np.nan, inplace = True)
In [79]:
data.A16.replace({'+':1, '-':0}, inplace = True)
In [80]:
data.head()
Out[80]:
In [81]:
## fill A1
a_count = data.A1[data.A1 == 'a'].count()
b_count = data.A1[data.A1 == 'b'].count()
# fraction of values that are 'b'
float(b_count)/(a_count + b_count)
Out[81]:
In [82]:
# figure out how many null values are in A1
data[data.A1.isnull()]
#data[data.A1.isnull()].count()
# the non-null counts in the other columns (e.g. A2) show there are 12 rows where A1 is null
Out[82]:
In [83]:
# fill null values in A1 based on the % distribution of 'a' and 'b'
# size = 12 because that's the number of null values in A1
A1_fillna = np.random.choice(('a', 'b'), size = 12, p = (.31, .69))
In [84]:
data.loc[data.A1.isnull(), 'A1'] = A1_fillna
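For reference, a minimal sketch of the same A1 fill that derives the null count and the a/b proportions from the data instead of hardcoding 12 and (.31, .69); the variable names here are illustrative and it is equivalent to the two cells above.
# illustrative alternative: compute the size and probabilities rather than hardcoding them
n_missing_A1 = data.A1.isnull().sum()
p_b = float(b_count) / (a_count + b_count)
data.loc[data.A1.isnull(), 'A1'] = np.random.choice(('a', 'b'), size = n_missing_A1,
                                                     p = (1 - p_b, p_b))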
In [85]:
## fill A2
# they're all numbers but listed as objects --> change to numbers
data.A2 = data.A2.astype(float)
# get mean and std dev so we can fill null values
print('Mean: ', data.A2.mean())
print('Std dev: ', data.A2.std())
In [86]:
# create a function that imputes the null values with random draws from a normal
# distribution parameterized by A2's mean and standard deviation
def get_A2_impute_values(n):
    return np.random.normal(data.A2.mean(), data.A2.std(), n)
In [87]:
data.loc[data.A2.isnull(), 'A2'] = get_A2_impute_values(12)
In [88]:
## null values in A4
data[data.A4.isnull()]
# these 6 rows are missing values in several other columns as well
Out[88]:
In [89]:
# if we remove them, will it significantly change the class balance?
approved_count = data.A16[data.A16 == 1].count()
notapproved_count = data.A16[data.A16 == 0].count()
# 4 of the 6 rows were approved (A16 == 1) and 2 were not
print(float(approved_count - 4) / approved_count)
print(float(notapproved_count - 2) / notapproved_count)
# based on these percentages, removing the 6 rows is not a big deal
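The 4 and 2 above are read off the table of null-A4 rows; a small sketch that computes them directly (the variable name is illustrative):
# check: class counts among the rows where A4 is null
null_A4_by_class = data.loc[data.A4.isnull(), 'A16'].value_counts()
print(null_A4_by_class)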
In [90]:
# drop all rows where value in A4 is null
data.dropna(subset=['A4'], how='all', inplace = True)
In [91]:
## null values in A6 and A7
data[data.A6.isnull()]
Out[91]:
In [92]:
sum(data.A6.value_counts())
Out[92]:
In [93]:
data.A6.value_counts()
Out[93]:
In [94]:
# fill the nulls in A6 by sampling from the empirical distribution of the observed categories
from scipy import stats
counts = data.A6.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                 counts / counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A6.count())
data.loc[data.A6.isnull(), "A6"] = counts.iloc[fill_idxs].index.values
data.A6.value_counts()
Out[94]:
In [95]:
data[data.A7.isnull()]
Out[95]:
In [96]:
data.A7.value_counts()
Out[96]:
In [97]:
# as with A6, fill the nulls in A7 by sampling from the empirical distribution of the observed categories
counts = data.A7.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                 counts / counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A7.count())
data.loc[data.A7.isnull(), "A7"] = counts.iloc[fill_idxs].index.values
data.A7.value_counts()
Out[97]:
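Since the A6 and A7 fills are identical except for the column name, here is a hedged sketch of a small helper that could replace both cells; the function name is made up for illustration.
# illustrative helper: fill a categorical column's nulls by sampling from its
# empirical distribution (same logic as the A6/A7 cells above)
def fill_categorical_na(df, col):
    counts = df[col].value_counts()
    dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                     counts / counts.sum()))
    fill_idxs = dist.rvs(size=df.shape[0] - df[col].count())
    df.loc[df[col].isnull(), col] = counts.iloc[fill_idxs].index.values

# usage: fill_categorical_na(data, 'A6'); fill_categorical_na(data, 'A7')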
In [98]:
data.info()
# based on this, we determine the only remaining column with null values is A14
In [99]:
data[data.A14.isnull()]
Out[99]:
In [100]:
data.A14.value_counts()
Out[100]:
In [101]:
# for A14 we fill the remaining nulls with the mode
data.loc[data.A14.isnull(), 'A14'] = data.A14.mode()[0]
data.A14.value_counts()
Out[101]:
In [102]:
data.describe()
Out[102]:
In [103]:
'''
Now is probably the time to split this into train/test...
I got a little overwhelmed with:
1. When to add dummies and how that might impact things.
2. If you add dummies and then split, how do you do exploratory plots?
I guess you could do the exploratory plots on the original data, then add
dummies to the whole dataset and train-test-split for the modeling
(a leakage-aware alternative is sketched after this cell).
'''
Out[103]:
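One common way to sidestep the dummies-then-split question is to keep the raw data for exploratory plots and let the encoding happen inside a pipeline that is fit on the training fold only. A hedged sketch, assuming a newer scikit-learn that has ColumnTransformer and OneHotEncoder; this is illustrative and not what the rest of this notebook does.
# illustrative sketch: split first, then one-hot encode inside a pipeline so the
# encoder only ever sees the training data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

cat_cols = data.drop('A16', axis=1).select_dtypes(include=['object']).columns.tolist()
pre = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                        remainder='passthrough')
pipe = Pipeline([('pre', pre), ('clf', LogisticRegression())])

Xtr, Xte, ytr, yte = train_test_split(data.drop('A16', axis=1), data.A16,
                                      test_size=.3, random_state=3)
pipe.fit(Xtr, ytr)
pipe.score(Xte, yte)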
In [104]:
## PLOT STUFF TO SEE WHAT IT LOOKS LIKE
data.A2.hist(bins = 50)
Out[104]:
In [105]:
data.A1.value_counts().plot(kind = 'bar')
Out[105]:
In [106]:
data.A4.value_counts().plot(kind = 'bar')
Out[106]:
In [107]:
data.A5.value_counts().plot(kind = 'bar')
Out[107]:
In [108]:
data.A6.value_counts().plot(kind = 'bar')
Out[108]:
In [109]:
data.A7.value_counts().plot(kind = 'bar')
Out[109]:
In [110]:
data.A8.value_counts().plot(kind = 'bar')
Out[110]:
In [111]:
data.A9.value_counts().plot(kind = 'bar')
Out[111]:
In [112]:
data.A10.value_counts().plot(kind = 'bar')
Out[112]:
In [113]:
data.A11.value_counts().plot(kind = 'bar')
Out[113]:
In [114]:
data.A12.value_counts().plot(kind = 'bar')
Out[114]:
In [115]:
data.A13.value_counts().plot(kind = 'bar')
Out[115]:
In [116]:
data.A14.value_counts().plot(kind = 'bar')
Out[116]:
In [117]:
data.A15.value_counts().plot(kind = 'bar')
Out[117]:
In [118]:
data.groupby(['A4'])['A16'].mean()
Out[118]:
In [119]:
data.hist(bins = 50, figsize = (15,15));
In [120]:
data.corr()
Out[120]:
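Since seaborn is already imported, a heatmap can make the same correlation table easier to scan; a small optional sketch, assuming data.corr() behaves as in the cell above.
# optional: visualize the correlation matrix as a heatmap
sns.heatmap(data.corr(), annot=True, cmap='Greens');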
In [121]:
from pandas.plotting import scatter_matrix
In [122]:
scatter_matrix(data, figsize = (20,20));
In [123]:
sns.factorplot("A1", hue = "A4", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [124]:
sns.factorplot("A1", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [125]:
sns.factorplot("A4", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [126]:
sns.factorplot("A9", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[126]:
In [127]:
sns.factorplot("A10", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[127]:
In [128]:
sns.factorplot("A12", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[128]:
In [129]:
sns.factorplot("A9", hue = "A10", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[129]:
In [130]:
## DUMMIES
dummy = pd.get_dummies(data)
dummy.drop('A16', axis = 1, inplace = True)
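A hedged note on the encoding: pd.get_dummies accepts drop_first=True, which drops one level per categorical column and avoids perfectly redundant dummy columns. A sketch of that variant (if used, it would replace the cell above; the name dummy_alt is illustrative):
# illustrative variant: drop the first level of each categorical to avoid redundant columns
dummy_alt = pd.get_dummies(data, drop_first=True).drop('A16', axis=1)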
In [131]:
dummy.info()
In [132]:
Target = data.A16.values
In [133]:
Features = dummy
In [134]:
## TRAIN TEST SPLIT
f_train, f_test, t_train, t_test = train_test_split(Features, Target,
                                                     random_state = 3,
                                                     test_size = .3)
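If you want the train and test folds to keep the same approved/not-approved ratio, train_test_split also accepts a stratify argument; an optional variant of the split above (if used, it would replace it):
# optional variant: stratify on the target so both folds keep the class balance
f_train, f_test, t_train, t_test = train_test_split(Features, Target,
                                                     random_state = 3,
                                                     test_size = .3,
                                                     stratify = Target)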
In [135]:
## LOGISTIC REGRESSION
lr = LogisticRegression()
In [136]:
lr.fit(f_train, t_train)
Out[136]:
In [137]:
lr.score(f_test, t_test)
Out[137]:
In [138]:
## CONFUSION MATRIX
t_pred = lr.predict(f_test)
In [139]:
confusion_matrix(t_test, t_pred)
Out[139]:
In [140]:
print(classification_report(t_test, t_pred))
In [141]:
## SVM
est = LinearSVC(C=1e+1)
In [142]:
est.fit(f_train, t_train)
Out[142]:
In [143]:
est.score(f_test, t_test)
# the score is consistently in the 0.7 to 0.8 range when test_size is 0.2 instead of 0.3
Out[143]:
In [144]:
## GRID SEARCH
d = {}
# search C over a log-spaced grid from 1e-3 to 1e3
d['C'] = np.logspace(-3, 3, 10)
In [145]:
gs = GridSearchCV(LinearSVC(),d)
gs.fit(f_train, t_train)
Out[145]:
In [146]:
gs.best_params_, gs.best_score_
Out[146]:
In [147]:
gs.score(f_train, t_train)
Out[147]:
In [148]:
t_pred = gs.predict(f_test)
In [149]:
confusion_matrix(t_test, t_pred)
Out[149]:
In [150]:
print(classification_report(t_test, t_pred))
In [151]:
## SVC - SVM with the default nonlinear (RBF) kernel
svc = SVC()
In [152]:
svc.fit(f_train, t_train)
Out[152]:
In [153]:
svc.score(f_train, t_train)
Out[153]:
In [154]:
t_pred = svc.predict(f_test)
In [155]:
confusion_matrix(t_test, t_pred)
Out[155]:
In [156]:
print(classification_report(t_test, t_pred))
In [157]:
d = {}
d['C'] = np.logspace(-3,3,10)
In [158]:
gs = GridSearchCV(SVC(),d)
gs.fit(f_train, t_train)
Out[158]:
In [159]:
gs.best_params_, gs.best_score_
Out[159]:
In [160]:
gs.score(f_train, t_train)
Out[160]:
In [161]:
t_pred = gs.predict(f_test)
In [162]:
confusion_matrix(t_test, t_pred)
Out[162]:
In [163]:
print(classification_report(t_test, t_pred))
In [164]:
## Grid search over C and gamma to find the best-performing SVC
param = {'C':np.logspace(-3,3,10), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)
In [165]:
gs.fit(f_train, t_train)
Out[165]:
In [166]:
gs.score(f_train, t_train)
Out[166]:
In [167]:
t_pred = gs.predict(f_test)
In [168]:
confusion_matrix(t_test, t_pred)
Out[168]:
In [169]:
print(classification_report(t_test, t_pred))
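As a wrap-up, a small hedged sketch that scores each of the models fitted above on the held-out test set in one place; it only reuses estimators already fit in this notebook.
# summary: held-out test accuracy for the models fit above
for name, model in [('LogisticRegression', lr),
                    ('LinearSVC', est),
                    ('SVC (default)', svc),
                    ('SVC (grid-searched C, gamma)', gs)]:
    print(name, model.score(f_test, t_test))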
In [ ]: