In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# enable inline plots
%matplotlib inline

In [3]:
# Show all columns in the notebook output
pd.set_option('display.max_columns', None)

In [4]:
pd.set_option('display.max_rows', 100)

In [5]:
pd.set_option('display.precision', 3)

In [6]:
pd.set_option('display.mpl_style', 'default')

In [7]:
columns = ['A1', 'Age', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
df = pd.read_csv('../data/hw2_data.data', names = columns)

In [8]:
df.replace('?', np.nan, inplace = True)

In [9]:
df.A16.replace({'+' : 1, '-' : 0}, inplace = True)

In [10]:
df.A1.describe()


Out[10]:
count     678
unique      2
top         b
freq      468
Name: A1, dtype: object

Fill A1


In [11]:
from scipy import stats
# build a discrete distribution over A1's categories, weighted by their observed frequencies
counts = df.A1.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                 counts/counts.sum()))
# draw one category index per missing value and write the corresponding labels back
fill_idxs = dist.rvs(size=df.shape[0] - df.A1.count())
df.loc[df.A1.isnull(), "A1"] = counts.iloc[fill_idxs].index.values

In [12]:
df[df.A1.isnull()]


Out[12]:
A1 Age A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16

how to do it manually


In [13]:
#a_count = df.A1[df.A1 == 'a'].count()
#b_count = df.A1[df.A1 == 'b'].count()
#float( b_count)/(a_count + b_count)

In [14]:
# how to do it manually
# to fill categorical info based on percentage
# A1_fill = np.random.choice(('a','b'), size= 12, p = (.31, .69)) 
#df.loc[df.A1.isnull(), "A1"] = A1_fill
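
A runnable version of this manual approach, with the fill size and category proportions computed from the data rather than hardcoded (a sketch, not part of the original run; A1 has already been filled above, so it is shown for reference only):

# manual categorical fill: draw 'a'/'b' in the observed proportions
counts = df.A1.value_counts()
probs = counts / float(counts.sum())          # roughly 0.69 'b' / 0.31 'a'
n_missing = df.A1.isnull().sum()
fill_vals = np.random.choice(probs.index.values, size=n_missing, p=probs.values)
df.loc[df.A1.isnull(), 'A1'] = fill_vals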

In [15]:
df.A1.describe()


Out[15]:
count     690
unique      2
top         b
freq      477
Name: A1, dtype: object

Fill Age


In [16]:
df.Age = df.Age.astype(float)

In [17]:
print 'Mean Column:', df.Age.mean()
print 'Std Column:', df.Age.std()


Mean Column: 31.5681710914
Std Column: 11.9578624983

In [18]:
def get_age_impute_values(n):
    return np.random.normal(df.Age.mean(), df.Age.std(), n)

In [19]:
df.loc[df.Age.isnull(), 'Age'] = get_age_impute_values(12)  # 12 Age values are missing
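
The normal draws above can occasionally fall outside the observed Age range (including negative ages); a small sketch of a clipped variant, as an alternative the notebook does not use:

def get_age_impute_values_clipped(n):
    # same normal draw, but kept inside the observed Age range
    draws = np.random.normal(df.Age.mean(), df.Age.std(), n)
    return np.clip(draws, df.Age.min(), df.Age.max())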

Dropping the rows where A4 is null (those rows have several other nulls as well)


In [20]:
df.dropna(subset=['A4'], how='all', inplace = True)

Same method for the nulls in A6, A7, and A14 (these seem worth filling in rather than dropping)


In [21]:
# A6, A7, A14: fill each column's nulls by sampling from its observed category frequencies
for col in ['A6', 'A7', 'A14']:
    counts = df[col].value_counts()
    dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                     counts/counts.sum()))
    fill_idxs = dist.rvs(size=df.shape[0] - df[col].count())
    df.loc[df[col].isnull(), col] = counts.iloc[fill_idxs].index.values

In [22]:
# all missing values are now filled (categoricals by proportional sampling, Age by normal draws)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Data columns (total 16 columns):
A1     684 non-null object
Age    684 non-null float64
A3     684 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     684 non-null object
A7     684 non-null object
A8     684 non-null float64
A9     684 non-null object
A10    684 non-null object
A11    684 non-null int64
A12    684 non-null object
A13    684 non-null object
A14    684 non-null object
A15    684 non-null int64
A16    684 non-null int64
dtypes: float64(3), int64(3), object(10)
memory usage: 90.8+ KB

All Values Filled
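
A one-line sanity check (not part of the original run) that nothing was missed:

assert df.isnull().sum().sum() == 0   # raises if any cell is still null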

PLOTS


In [23]:
df.Age.hist(bins = 50)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x109eb2d90>

In [24]:
df.A1.value_counts().plot(kind = 'bar') #gender?


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a0f4c10>

In [25]:
df.A4.value_counts().plot(kind = 'bar')


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a19bed0>

In [26]:
df.A5.value_counts().plot(kind = 'bar')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a497390>

In [27]:
df.A6.value_counts().plot(kind = 'bar')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a5b8410>

In [28]:
df.A7.value_counts().plot(kind = 'bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7d1650>

In [29]:
df.A9.value_counts().plot(kind = 'bar')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a85ca10>

In [30]:
df.A10.value_counts().plot(kind = 'bar')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a9c5cd0>

In [31]:
df.A12.value_counts().plot(kind = 'bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ac125d0>

In [32]:
df.A13.value_counts().plot(kind = 'bar')


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ae2a610>

In [33]:
df.A14.value_counts().plot(kind = 'bar')


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b04d910>
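
The per-column bar plots above can also be produced in one pass; a minimal sketch over the same categorical columns:

cat_cols = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A14']
fig, axes = plt.subplots(5, 2, figsize=(12, 20))
for col, ax in zip(cat_cols, axes.ravel()):
    df[col].value_counts().plot(kind='bar', ax=ax, title=col)
plt.tight_layout()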

In [34]:
df.groupby(['A4'])['A16'].mean()


Out[34]:
A4
l     1.00
u     0.49
y     0.28
Name: A16, dtype: float64
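
The same per-category approval rate (A16 is 1 for '+') can be computed for any of the other categorical columns; a short sketch:

# approval rate broken down by each categorical column
approval_by_cat = {col: df.groupby(col)['A16'].mean()
                   for col in ['A1', 'A4', 'A5', 'A9', 'A10', 'A12', 'A13']}
approval_by_cat['A9']   # e.g. the rate for A9 == 't' vs 'f'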

In [35]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Data columns (total 16 columns):
A1     684 non-null object
Age    684 non-null float64
A3     684 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     684 non-null object
A7     684 non-null object
A8     684 non-null float64
A9     684 non-null object
A10    684 non-null object
A11    684 non-null int64
A12    684 non-null object
A13    684 non-null object
A14    684 non-null object
A15    684 non-null int64
A16    684 non-null int64
dtypes: float64(3), int64(3), object(10)
memory usage: 90.8+ KB

In [36]:
df.hist(bins = 50, figsize = (15, 15));



In [37]:
df.corr()


Out[37]:
      Age    A3    A8   A11   A15   A16
Age  1.00  0.21  0.40  0.19  0.02  0.16
A3   0.21  1.00  0.30  0.27  0.12  0.21
A8   0.40  0.30  1.00  0.32  0.05  0.33
A11  0.19  0.27  0.32  1.00  0.06  0.41
A15  0.02  0.12  0.05  0.06  1.00  0.18
A16  0.16  0.21  0.33  0.41  0.18  1.00
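
Since seaborn is already imported, the same matrix reads more easily as a heatmap; a minimal sketch (the annot argument assumes a seaborn version that supports it):

plt.figure(figsize=(6, 5))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1)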

Dummies


In [38]:
dummy = pd.get_dummies(df)
dummy.drop('A16', axis = 1, inplace = True)

In [39]:
dummy.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Columns: 215 entries, Age to A14_02000
dtypes: float64(213), int64(2)
memory usage: 1.1 MB

In [40]:
Target = df.A16.values

In [41]:
Features = dummy

train_test_split


In [42]:
from sklearn.cross_validation import train_test_split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(Features, Target, random_state=3, test_size=0.2)

LogisticRegression


In [44]:
from sklearn.linear_model import LogisticRegression

In [45]:
clf = LogisticRegression()

In [46]:
clf.fit(X_train, y_train)


Out[46]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [47]:
clf.score(X_test, y_test)


Out[47]:
0.78832116788321172

confusion_matrix


In [48]:
from sklearn.metrics import confusion_matrix, classification_report

In [49]:
y_pred = clf.predict(X_test)

In [50]:
confusion_matrix(y_test, y_pred)


Out[50]:
array([[62, 20],
       [ 9, 46]])

In [51]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.87      0.76      0.81        82
          1       0.70      0.84      0.76        55

avg / total       0.80      0.79      0.79       137
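
Reading the matrix by hand: precision for class 1 is 46/(20+46) ≈ 0.70 and recall is 46/(9+46) ≈ 0.84; for class 0, precision is 62/(62+9) ≈ 0.87 and recall is 62/(62+20) ≈ 0.76, matching the report above.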

dropping features by coefficient magnitude


In [52]:
# coefficients of features in dummy
coef = pd.DataFrame(zip(dummy.columns, np.transpose(clf.coef_)))
coef.head()


Out[52]:
     0                      1
0  Age   [-0.00808458116758]
1   A3    [0.0020650011804]
2   A8    [0.0316148522263]
3  A11     [0.156037903518]
4  A15  [0.000418988090299]

In [53]:
# renaming columns
coef.rename(columns={0 : 'features', 1 : 'coef'}, inplace = True)

In [54]:
# note: the .order() call is not needed for the filter (plain .abs() > 1.66 would do) and triggers the reindex warning below
coef[coef['coef'].abs().order(ascending = False) > 1.66]


/Users/David/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1808: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[54]:
   features              coef
36     A9_f  [-2.03109959242]
37     A9_t   [1.7966322344]

In [55]:
# selecting the features whose |coefficient| exceeds 0.5
features_0 = coef.features[coef['coef'].abs().order(ascending = False) > .5].values
features_0


Out[55]:
array(['A6_aa', 'A6_cc', 'A6_d', 'A6_ff', 'A6_i', 'A6_x', 'A7_bb', 'A7_ff',
       'A7_n', 'A9_f', 'A9_t', 'A14_00000', 'A14_00132', 'A14_00140',
       'A14_00150', 'A14_00167', 'A14_00180', 'A14_00200', 'A14_00208',
       'A14_00232', 'A14_00288', 'A14_00329', 'A14_00400', 'A14_00420',
       'A14_00500', 'A14_00560', 'A14_00760', 'A14_00980'], dtype=object)
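
The bracketed values above come from keeping each coefficient as a one-element array; a cleaner sketch that avoids both the brackets and the reindex warning, using the 1.66 threshold from above:

coef_s = pd.Series(clf.coef_.ravel(), index=dummy.columns)
coef_s[coef_s.abs() > 1.66]    # the strongest features by |coefficient|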

In [56]:
# dropping the selected features from the dummy matrix
dummy_0 = dummy.drop(features_0, axis = 1)

In [57]:
dummy_0.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Columns: 187 entries, Age to A14_02000
dtypes: float64(185), int64(2)
memory usage: 1004.6 KB

running the model with fewer columns


In [58]:
Features_0 = dummy_0

In [59]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(Features_0, Target, random_state=3, test_size=0.2)

In [60]:
clf.fit(X_train_0, y_train_0)


Out[60]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

The score barely moves (here it is actually marginally higher than with all the features); it also differs from run to run depending on the random_state used


In [61]:
clf.score(X_test_0, y_test_0)


Out[61]:
0.81021897810218979

In [62]:
# even without the strongest features, the remaining low-coefficient ones still carry most of the signal
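
Because a single train/test split is noisy, a cross-validated comparison of the full and reduced feature sets is steadier; a sketch using the same sklearn layout as the imports above:

from sklearn.cross_validation import cross_val_score
cv_full = cross_val_score(LogisticRegression(), Features, Target, cv=5).mean()
cv_reduced = cross_val_score(LogisticRegression(), Features_0, Target, cv=5).mean()
cv_full, cv_reduced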

Support Vector Machines


In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

In [64]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(Features_0, Target, random_state=3, test_size = 0.3)
# note: the SVM cells below still use X_train / X_test from In [43]; this second split is unused

In [65]:
est = LinearSVC(C=1e+1)

In [66]:
est.fit(X_train, y_train)


Out[66]:
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [67]:
est.score(X_test, y_test)


Out[67]:
0.74452554744525545
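
SVMs are sensitive to feature scale, and columns such as A15 dwarf the 0/1 dummies; a sketch of standardizing before fitting (StandardScaler is from sklearn.preprocessing; the cells above do not do this):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

est_s = LinearSVC(C=1e+1)
est_s.fit(X_train_s, y_train)
est_s.score(X_test_s, y_test)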

GridSearch


In [68]:
d = {}
d['C'] = np.logspace(-3,3,10)
d['C']


Out[68]:
array([  1.00000000e-03,   4.64158883e-03,   2.15443469e-02,
         1.00000000e-01,   4.64158883e-01,   2.15443469e+00,
         1.00000000e+01,   4.64158883e+01,   2.15443469e+02,
         1.00000000e+03])

In [69]:
%%time
gs = GridSearchCV(LinearSVC(),d)
gs.fit(X_train, y_train)


CPU times: user 966 ms, sys: 5.35 ms, total: 971 ms
Wall time: 983 ms

In [70]:
gs.best_params_,gs.best_score_


Out[70]:
({'C': 0.10000000000000001}, 0.85740402193784282)

In [71]:
gs.score(X_train, y_train)


Out[71]:
0.92321755027422303
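
Note that gs.score(X_train, y_train) is a training-set score and reads optimistically; the held-out number is the one comparable to the earlier models (a one-line sketch):

gs.score(X_test, y_test)   # accuracy on the held-out split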

In [72]:
y_pred = gs.predict(X_test)

In [73]:
66/(9+66.)


Out[73]:
0.88

In [74]:
confusion_matrix(y_test, y_pred)


Out[74]:
array([[62, 20],
       [ 9, 46]])

In [75]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.87      0.76      0.81        82
          1       0.70      0.84      0.76        55

avg / total       0.80      0.79      0.79       137

Non-Linear kernel


In [76]:
svc = SVC()

In [77]:
svc.fit(X_train, y_train)


Out[77]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [78]:
svc.score(X_train, y_train)


Out[78]:
0.85009140767824498

In [79]:
y_pred = svc.predict(X_test)

In [80]:
confusion_matrix(y_test, y_pred)


Out[80]:
array([[70, 12],
       [18, 37]])

In [81]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.80      0.85      0.82        82
          1       0.76      0.67      0.71        55

avg / total       0.78      0.78      0.78       137


In [82]:
d = {}
d['C'] = np.logspace(-3, 3, 10)

grid search for SVC


In [83]:
%%time
gs = GridSearchCV(SVC(),d)
gs.fit(X_train, y_train)


CPU times: user 1.76 s, sys: 8.84 ms, total: 1.77 s
Wall time: 1.8 s

In [84]:
gs.best_params_,gs.best_score_


Out[84]:
({'C': 10.0}, 0.74223034734917737)

In [85]:
gs.score(X_train, y_train)


Out[85]:
0.94698354661791595

In [86]:
y_pred = gs.predict(X_test)

In [87]:
confusion_matrix(y_test, y_pred)


Out[87]:
array([[60, 22],
       [17, 38]])

In [88]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.78      0.73      0.75        82
          1       0.63      0.69      0.66        55

avg / total       0.72      0.72      0.72       137


In [89]:
param = {'C' : np.logspace(-3,3,5), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)

In [90]:
%%time
gs.fit(X_train, y_train)


CPU times: user 4.04 s, sys: 12.5 ms, total: 4.06 s
Wall time: 4.09 s
Out[90]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03]), 'gamma': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [91]:
gs.score(X_train, y_train)


Out[91]:
0.91224862888482627

In [92]:
y_pred = gs.predict(X_test)

confusion_matrix


In [93]:
confusion_matrix(y_test, y_pred)


Out[93]:
array([[58, 24],
       [17, 38]])

In [94]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.77      0.71      0.74        82
          1       0.61      0.69      0.65        55

avg / total       0.71      0.70      0.70       137

