In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# enable inline plots
%matplotlib inline

In [3]:
# Show all columns in the notebook output
pd.set_option('display.max_columns', None)

In [4]:
pd.set_option('display.max_rows', 100)

In [5]:
pd.set_option('display.precision', 3)

In [6]:
pd.set_option('display.mpl_style', 'default')

In [7]:
columns = ['A1', 'Age', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
df = pd.read_csv('../data/hw2_data.data', names = columns)

In [8]:
df.replace('?', np.nan, inplace = True)

In [9]:
df.A16.replace({'+' : 1, '-' : 0}, inplace = True)

In [10]:
df.A1.describe()


Out[10]:
count     678
unique      2
top         b
freq      468
Name: A1, dtype: object

Fill A1


In [11]:
from scipy import stats
# build a discrete distribution over A1's categories, weighted by their observed frequencies
counts = df.A1.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                 counts/counts.sum()))
# draw one category index per missing value and write the corresponding labels back
fill_idxs = dist.rvs(size=df.shape[0] - df.A1.count())
df.loc[df.A1.isnull(), "A1"] = counts.iloc[fill_idxs].index.values

In [12]:
df[df.A1.isnull()]


Out[12]:
A1 Age A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16

how to do it manually


In [13]:
#a_count = df.A1[df.A1 == 'a'].count()
#b_count = df.A1[df.A1 == 'b'].count()
#float( b_count)/(a_count + b_count)

In [14]:
# how to do it manually
# to fill categorical info based on percentage
# A1_fill = np.random.choice(('a','b'), size= 12, p = (.31, .69)) 
#df.loc[df.A1.isnull(), "A1"] = A1_fill
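
A runnable version of this manual approach, with the fill size and category proportions computed from the data rather than hardcoded (a sketch, not part of the original run; A1 has already been filled above, so it is shown for reference only):

# manual categorical fill: draw 'a'/'b' in the observed proportions
counts = df.A1.value_counts()
probs = counts / float(counts.sum())          # roughly 0.69 'b' / 0.31 'a'
n_missing = df.A1.isnull().sum()
fill_vals = np.random.choice(probs.index.values, size=n_missing, p=probs.values)
df.loc[df.A1.isnull(), 'A1'] = fill_vals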

In [15]:
df.A1.describe()


Out[15]:
count     690
unique      2
top         b
freq      477
Name: A1, dtype: object

Fill Age


In [16]:
df.Age = df.Age.astype(float)

In [17]:
print 'Mean Column:', df.Age.mean()
print 'Std Column:', df.Age.std()


Mean Column: 31.5681710914
Std Column: 11.9578624983

In [18]:
def get_age_impute_values(n):
    return np.random.normal(df.Age.mean(), df.Age.std(), n)

In [19]:
df.loc[df.Age.isnull(), 'Age'] = get_age_impute_values(12)  # 12 Age values are missing
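
The normal draws above can occasionally fall outside the observed Age range (including negative ages); a small sketch of a clipped variant, as an alternative the notebook does not use:

def get_age_impute_values_clipped(n):
    # same normal draw, but kept inside the observed Age range
    draws = np.random.normal(df.Age.mean(), df.Age.std(), n)
    return np.clip(draws, df.Age.min(), df.Age.max())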

Dropping the rows where A4 is null (those rows have several other nulls as well)


In [20]:
df.dropna(subset=['A4'], how='all', inplace = True)

Same method for the nulls in A6, A7, and A14 (these seem worth filling in rather than dropping)


In [21]:
# A6, A7, A14: fill each column's nulls by sampling from its observed category frequencies
for col in ['A6', 'A7', 'A14']:
    counts = df[col].value_counts()
    dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                     counts/counts.sum()))
    fill_idxs = dist.rvs(size=df.shape[0] - df[col].count())
    df.loc[df[col].isnull(), col] = counts.iloc[fill_idxs].index.values

In [22]:
# all missing values are now filled (categoricals by proportional sampling, Age by normal draws)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Data columns (total 16 columns):
A1     684 non-null object
Age    684 non-null float64
A3     684 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     684 non-null object
A7     684 non-null object
A8     684 non-null float64
A9     684 non-null object
A10    684 non-null object
A11    684 non-null int64
A12    684 non-null object
A13    684 non-null object
A14    684 non-null object
A15    684 non-null int64
A16    684 non-null int64
dtypes: float64(3), int64(3), object(10)
memory usage: 90.8+ KB

All Values Filled
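
A one-line sanity check (not part of the original run) that nothing was missed:

assert df.isnull().sum().sum() == 0   # raises if any cell is still null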

PLOTS


In [23]:
df.Age.hist(bins = 50)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x109eb2d90>

In [24]:
df.A1.value_counts().plot(kind = 'bar') #gender?


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a0f4c10>

In [25]:
df.A4.value_counts().plot(kind = 'bar')


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a19bed0>

In [26]:
df.A5.value_counts().plot(kind = 'bar')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a497390>

In [27]:
df.A6.value_counts().plot(kind = 'bar')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a5b8410>

In [28]:
df.A7.value_counts().plot(kind = 'bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a7d1650>

In [29]:
df.A9.value_counts().plot(kind = 'bar')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a85ca10>

In [30]:
df.A10.value_counts().plot(kind = 'bar')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x10a9c5cd0>

In [31]:
df.A12.value_counts().plot(kind = 'bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ac125d0>

In [32]:
df.A13.value_counts().plot(kind = 'bar')


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ae2a610>

In [33]:
df.A14.value_counts().plot(kind = 'bar')


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b04d910>
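
The per-column bar plots above can also be produced in one pass; a minimal sketch over the same categorical columns:

cat_cols = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A14']
fig, axes = plt.subplots(5, 2, figsize=(12, 20))
for col, ax in zip(cat_cols, axes.ravel()):
    df[col].value_counts().plot(kind='bar', ax=ax, title=col)
plt.tight_layout()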

In [34]:
df.groupby(['A4'])['A16'].mean()


Out[34]:
A4
l     1.00
u     0.49
y     0.28
Name: A16, dtype: float64
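
The same per-category approval rate (A16 is 1 for '+') can be computed for any of the other categorical columns; a short sketch:

# approval rate broken down by each categorical column
approval_by_cat = {col: df.groupby(col)['A16'].mean()
                   for col in ['A1', 'A4', 'A5', 'A9', 'A10', 'A12', 'A13']}
approval_by_cat['A9']   # e.g. the rate for A9 == 't' vs 'f'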

In [35]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Data columns (total 16 columns):
A1     684 non-null object
Age    684 non-null float64
A3     684 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     684 non-null object
A7     684 non-null object
A8     684 non-null float64
A9     684 non-null object
A10    684 non-null object
A11    684 non-null int64
A12    684 non-null object
A13    684 non-null object
A14    684 non-null object
A15    684 non-null int64
A16    684 non-null int64
dtypes: float64(3), int64(3), object(10)
memory usage: 90.8+ KB

In [36]:
df.hist(bins = 50, figsize = (15, 15));



In [37]:
df.corr()


Out[37]:
      Age    A3    A8   A11   A15   A16
Age  1.00  0.21  0.40  0.19  0.02  0.16
A3   0.21  1.00  0.30  0.27  0.12  0.21
A8   0.40  0.30  1.00  0.32  0.05  0.33
A11  0.19  0.27  0.32  1.00  0.06  0.41
A15  0.02  0.12  0.05  0.06  1.00  0.18
A16  0.16  0.21  0.33  0.41  0.18  1.00
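
Since seaborn is already imported, the same matrix reads more easily as a heatmap; a minimal sketch (the annot argument assumes a seaborn version that supports it):

plt.figure(figsize=(6, 5))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1)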

Dummies


In [38]:
dummy = pd.get_dummies(df)
dummy.drop('A16', axis = 1, inplace = True)

In [39]:
dummy.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Columns: 215 entries, Age to A14_02000
dtypes: float64(213), int64(2)
memory usage: 1.1 MB

In [40]:
Target = df.A16.values

In [41]:
Features = dummy

train_test_split


In [42]:
from sklearn.cross_validation import train_test_split

In [43]:
X_train, X_test, y_train, y_test = train_test_split(Features, Target, random_state=3, test_size=0.2)

LogisticRegression


In [44]:
from sklearn.linear_model import LogisticRegression

In [45]:
clf = LogisticRegression()

In [46]:
clf.fit(X_train, y_train)


Out[46]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [47]:
clf.score(X_test, y_test)


Out[47]:
0.78832116788321172

confusion_matrix


In [48]:
from sklearn.metrics import confusion_matrix, classification_report

In [49]:
y_pred = clf.predict(X_test)

In [50]:
confusion_matrix(y_test, y_pred)


Out[50]:
array([[62, 20],
       [ 9, 46]])

In [51]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.87      0.76      0.81        82
          1       0.70      0.84      0.76        55

avg / total       0.80      0.79      0.79       137
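
Reading the matrix by hand: precision for class 1 is 46/(20+46) ≈ 0.70 and recall is 46/(9+46) ≈ 0.84; for class 0, precision is 62/(62+9) ≈ 0.87 and recall is 62/(62+20) ≈ 0.76, matching the report above.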

dropping features by coefficient magnitude


In [52]:
# coefficients of features in dummy
coef = pd.DataFrame(zip(dummy.columns, np.transpose(clf.coef_)))
coef.head()


Out[52]:
     0                      1
0  Age   [-0.00808458116758]
1   A3    [0.0020650011804]
2   A8    [0.0316148522263]
3  A11     [0.156037903518]
4  A15  [0.000418988090299]

In [53]:
# renaming columns
coef.rename(columns={0 : 'features', 1 : 'coef'}, inplace = True)

In [54]:
# note: the .order() call is not needed for the filter (plain .abs() > 1.66 would do) and triggers the reindex warning below
coef[coef['coef'].abs().order(ascending = False) > 1.66]


/Users/David/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1808: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[54]:
   features              coef
36     A9_f  [-2.03109959242]
37     A9_t   [1.7966322344]

In [55]:
# selecting the features whose |coefficient| exceeds 0.5
features_0 = coef.features[coef['coef'].abs().order(ascending = False) > .5].values
features_0


Out[55]:
array(['A6_aa', 'A6_cc', 'A6_d', 'A6_ff', 'A6_i', 'A6_x', 'A7_bb', 'A7_ff',
       'A7_n', 'A9_f', 'A9_t', 'A14_00000', 'A14_00132', 'A14_00140',
       'A14_00150', 'A14_00167', 'A14_00180', 'A14_00200', 'A14_00208',
       'A14_00232', 'A14_00288', 'A14_00329', 'A14_00400', 'A14_00420',
       'A14_00500', 'A14_00560', 'A14_00760', 'A14_00980'], dtype=object)
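
The bracketed values above come from keeping each coefficient as a one-element array; a cleaner sketch that avoids both the brackets and the reindex warning, using the 1.66 threshold from above:

coef_s = pd.Series(clf.coef_.ravel(), index=dummy.columns)
coef_s[coef_s.abs() > 1.66]    # the strongest features by |coefficient|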

In [56]:
# dropping the selected features from the dummy matrix
dummy_0 = dummy.drop(features_0, axis = 1)

In [57]:
dummy_0.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Columns: 187 entries, Age to A14_02000
dtypes: float64(185), int64(2)
memory usage: 1004.6 KB

running the model with fewer columns


In [58]:
Features_0 = dummy_0

In [59]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(Features_0, Target, random_state=3, test_size=0.2)

In [60]:
clf.fit(X_train_0, y_train_0)


Out[60]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

The score barely moves (here it is actually marginally higher than with all the features); it also differs from run to run depending on the random_state used


In [61]:
clf.score(X_test_0, y_test_0)


Out[61]:
0.81021897810218979

In [62]:
# even without the strongest features, the remaining low-coefficient ones still carry most of the signal
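
Because a single train/test split is noisy, a cross-validated comparison of the full and reduced feature sets is steadier; a sketch using the same sklearn layout as the imports above:

from sklearn.cross_validation import cross_val_score
cv_full = cross_val_score(LogisticRegression(), Features, Target, cv=5).mean()
cv_reduced = cross_val_score(LogisticRegression(), Features_0, Target, cv=5).mean()
cv_full, cv_reduced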

Support Vector Machines


In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

In [64]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(Features_0, Target, random_state=3, test_size = 0.3)
# note: the SVM cells below still use X_train / X_test from In [43]; this second split is unused

In [65]:
est = LinearSVC(C=1e+1)

In [66]:
est.fit(X_train, y_train)


Out[66]:
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [67]:
est.score(X_test, y_test)


Out[67]:
0.74452554744525545
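
SVMs are sensitive to feature scale, and columns such as A15 dwarf the 0/1 dummies; a sketch of standardizing before fitting (StandardScaler is from sklearn.preprocessing; the cells above do not do this):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

est_s = LinearSVC(C=1e+1)
est_s.fit(X_train_s, y_train)
est_s.score(X_test_s, y_test)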

GridSearch


In [68]:
d = {}
d['C'] = np.logspace(-3,3,10)
d['C']


Out[68]:
array([  1.00000000e-03,   4.64158883e-03,   2.15443469e-02,
         1.00000000e-01,   4.64158883e-01,   2.15443469e+00,
         1.00000000e+01,   4.64158883e+01,   2.15443469e+02,
         1.00000000e+03])

In [69]:
%%time
gs = GridSearchCV(LinearSVC(),d)
gs.fit(X_train, y_train)


CPU times: user 966 ms, sys: 5.35 ms, total: 971 ms
Wall time: 983 ms

In [70]:
gs.best_params_,gs.best_score_


Out[70]:
({'C': 0.10000000000000001}, 0.85740402193784282)

In [71]:
gs.score(X_train, y_train)


Out[71]:
0.92321755027422303
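
Note that gs.score(X_train, y_train) is a training-set score and reads optimistically; the held-out number is the one comparable to the earlier models (a one-line sketch):

gs.score(X_test, y_test)   # accuracy on the held-out split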

In [72]:
y_pred = gs.predict(X_test)

In [73]:
66/(9+66.)


Out[73]:
0.88

In [74]:
confusion_matrix(y_test, y_pred)


Out[74]:
array([[62, 20],
       [ 9, 46]])

In [75]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.87      0.76      0.81        82
          1       0.70      0.84      0.76        55

avg / total       0.80      0.79      0.79       137

Non-Linear kernel


In [76]:
svc = SVC()

In [77]:
svc.fit(X_train, y_train)


Out[77]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [78]:
svc.score(X_train, y_train)


Out[78]:
0.85009140767824498

In [79]:
y_pred = svc.predict(X_test)

In [80]:
confusion_matrix(y_test, y_pred)


Out[80]:
array([[70, 12],
       [18, 37]])

In [81]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.80      0.85      0.82        82
          1       0.76      0.67      0.71        55

avg / total       0.78      0.78      0.78       137


In [82]:
d = {}
d['C'] = np.logspace(-3, 3, 10)

grid search for SVC


In [83]:
%%time
gs = GridSearchCV(SVC(),d)
gs.fit(X_train, y_train)


CPU times: user 1.76 s, sys: 8.84 ms, total: 1.77 s
Wall time: 1.8 s

In [84]:
gs.best_params_,gs.best_score_


Out[84]:
({'C': 10.0}, 0.74223034734917737)

In [85]:
gs.score(X_train, y_train)


Out[85]:
0.94698354661791595

In [86]:
y_pred = gs.predict(X_test)

In [87]:
confusion_matrix(y_test, y_pred)


Out[87]:
array([[60, 22],
       [17, 38]])

In [88]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.78      0.73      0.75        82
          1       0.63      0.69      0.66        55

avg / total       0.72      0.72      0.72       137


In [89]:
param = {'C' : np.logspace(-3,3,5), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)

In [90]:
%%time
gs.fit(X_train, y_train)


CPU times: user 4.04 s, sys: 12.5 ms, total: 4.06 s
Wall time: 4.09 s
Out[90]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03]), 'gamma': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [91]:
gs.score(X_train, y_train)


Out[91]:
0.91224862888482627

In [92]:
y_pred = gs.predict(X_test)

confusion_matrix


In [93]:
confusion_matrix(y_test, y_pred)


Out[93]:
array([[58, 24],
       [17, 38]])

In [94]:
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.77      0.71      0.74        82
          1       0.61      0.69      0.65        55

avg / total       0.71      0.70      0.70       137

