In [74]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in scikit-learn 0.18+
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV            # moved to sklearn.model_selection in scikit-learn 0.18+
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Enable inline Plots
%matplotlib inline

# Limit rows displayed in notebook
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.precision', 2)

In [75]:
columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
data = pd.read_csv('../hw2/crx.data', names = columns)

In [76]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [77]:
data.describe()


Out[77]:
A3 A8 A11 A15
count 690.000000 690.000000 690.00000 690.000000
mean 4.758725 2.223406 2.40000 1017.385507
std 4.978163 3.346513 4.86294 5210.102598
min 0.000000 0.000000 0.00000 0.000000
25% 1.000000 0.165000 0.00000 0.000000
50% 2.750000 1.000000 0.00000 5.000000
75% 7.207500 2.625000 3.00000 395.500000
max 28.000000 28.500000 67.00000 100000.000000

In [78]:
data.replace('?', np.nan, inplace = True)

In [79]:
data.A16.replace({'+':1, '-':0}, inplace = True)

In [80]:
data.head()


Out[80]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 1
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 1
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 1
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 1
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 1

In [81]:
## fill A1
a_count = data.A1[data.A1 == 'a'].count()
b_count = data.A1[data.A1 == 'b'].count()
# fraction of values that are 'b'
float(b_count)/(a_count + b_count)


Out[81]:
0.6902654867256637

In [82]:
# figure out how many null values are in A1
data[data.A1.isnull()]
#data[data.A1.isnull()].count()
# e.g. running the commented count above gives 12 for column A2, so 12 rows have a null A1


Out[82]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
248 NaN 24.50 12.750 u g c bb 4.750 t t 2 f g 00073 444 1
327 NaN 40.83 3.500 u g i bb 0.500 f f 0 f s 01160 0 0
346 NaN 32.25 1.500 u g c v 0.250 f f 0 t g 00372 122 0
374 NaN 28.17 0.585 u g aa v 0.040 f f 0 f g 00260 1004 0
453 NaN 29.75 0.665 u g w v 0.250 f f 0 t g 00300 0 0
479 NaN 26.50 2.710 y p NaN NaN 0.085 f f 0 f s 00080 0 0
489 NaN 45.33 1.000 u g q v 0.125 f f 0 t g 00263 0 0
520 NaN 20.42 7.500 u g k v 1.500 t t 1 f g 00160 234 1
598 NaN 20.08 0.125 u g q v 1.000 f t 1 f g 00240 768 1
601 NaN 42.25 1.750 y p NaN NaN 0.000 f f 0 t g 00150 1 0
641 NaN 33.17 2.250 y p cc v 3.500 f f 0 t g 00200 141 0
673 NaN 29.50 2.000 y p e h 2.000 f f 0 f g 00256 17 0

In [83]:
# fill null values in A1 in proportion to the observed a/b split (~31% / ~69%)
# size = 12 because that is the number of null values in A1
A1_fillna = np.random.choice(('a', 'b'), size = 12, p = (.31, .69))

In [84]:
data.loc[data.A1.isnull(), 'A1'] = A1_fillna
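
A quick aside (not part of the original run): the same fill can be written without hardcoding the count of 12 or the probabilities (.31, .69) by deriving both from the column. A minimal sketch, with illustrative variable names:

# sketch: derive the fill size and category probabilities from A1 itself
a1_counts = data.A1.value_counts()
a1_probs = a1_counts / float(a1_counts.sum())      # observed shares of 'b' and 'a'
n_missing = data.A1.isnull().sum()                 # number of nulls left to fill
data.loc[data.A1.isnull(), 'A1'] = np.random.choice(a1_counts.index.values,
                                                    size=n_missing, p=a1_probs.values)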

In [85]:
## fill A2
# the values are numeric but stored as objects --> convert to float
data.A2 = data.A2.astype(float)
# get mean and std dev so we can fill null values
print 'Mean: ' , data.A2.mean()
print 'Std dev: ', data.A2.std()


Mean:  31.5681710914
Std dev:  11.9578624983

In [86]:
# create a function that draws imputation values from a normal distribution with A2's mean and standard deviation
def get_A2_impute_values(n):
    return np.random.normal(data.A2.mean(), data.A2.std(), n)

In [87]:
data.loc[data.A2.isnull(), 'A2'] = get_A2_impute_values(12)
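
Note that unbounded normal draws can land outside plausible ranges (the describe() further down shows a negative minimum for A2). A hedged sketch of keeping the imputed values within one standard deviation of the mean, if that is the intent:

# sketch: clip the random draws to [mean - std, mean + std]
mu, sigma = data.A2.mean(), data.A2.std()
draws = np.random.normal(mu, sigma, data.A2.isnull().sum())
data.loc[data.A2.isnull(), 'A2'] = np.clip(draws, mu - sigma, mu + sigma)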

In [88]:
## null values in A4

data[data.A4.isnull()]
# these 6 rows are missing values in many columns


Out[88]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
206 a 71.58 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 1
270 b 37.58 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 1
330 b 20.42 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 0
456 b 34.58 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 0
592 b 23.17 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 1
622 a 25.58 0 NaN NaN NaN NaN 0 f f 0 f p NaN 0 1

In [89]:
# if we remove them, will it be a very significant change?
approved_count = data.A16[data.A16 == 1].count()
notapproved_count = data.A16[data.A16 == 0].count()

print float(approved_count-4)/approved_count
print float(notapproved_count-2)/notapproved_count
# dropping these rows removes 4 approved and 2 not-approved cases, shrinking each
# class by less than 1.5%, so removing them should not meaningfully change the balance


0.986970684039
0.994778067885
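
A more direct way to ask the same question is to compare the approval rate before and after the drop; a small sketch (not run here):

# sketch: approval rate (mean of the 0/1 target) with and without the six rows
print 'Approval rate, all rows:         ', data.A16.mean()
print 'Approval rate, A4 nulls dropped: ', data.dropna(subset=['A4']).A16.mean()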

In [90]:
# drop all rows where value in A4 is null
data.dropna(subset=['A4'], how='all', inplace = True)

In [91]:
## null values in A6 and A7
data[data.A6.isnull()]


Out[91]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
479 a 26.50 2.71 y p NaN NaN 0.085 f f 0 f s 00080 0 0
539 b 80.25 5.50 u g NaN NaN 0.540 t f 0 f g 00000 340 0
601 b 42.25 1.75 y p NaN NaN 0.000 f f 0 t g 00150 1 0

In [92]:
sum(data.A6.value_counts())


Out[92]:
681

In [93]:
data.A6.value_counts()


Out[93]:
c     137
q      78
w      64
i      59
aa     54
ff     53
k      51
cc     41
x      38
m      38
d      30
e      25
j      10
r       3
dtype: int64

In [94]:
from scipy import stats

counts = data.A6.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                 counts/counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A6.count())
data.loc[data.A6.isnull(), "A6"] = counts.iloc[fill_idxs].index.values

data.A6.value_counts()


Out[94]:
c     137
q      79
w      64
i      59
aa     55
ff     53
k      51
cc     41
x      38
m      38
d      30
e      26
j      10
r       3
dtype: int64
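
For reference, the same frequency-proportional fill can be written with np.random.choice instead of scipy's rv_discrete; a hedged, equivalent sketch:

# sketch: sample replacement categories for A6 in proportion to their observed counts
counts = data.A6.value_counts()
n_missing = data.A6.isnull().sum()
data.loc[data.A6.isnull(), 'A6'] = np.random.choice(counts.index.values, size=n_missing,
                                                    p=(counts / float(counts.sum())).values)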

In [95]:
data[data.A7.isnull()]


Out[95]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
479 a 26.50 2.71 y p q NaN 0.085 f f 0 f s 00080 0 0
539 b 80.25 5.50 u g e NaN 0.540 t f 0 f g 00000 340 0
601 b 42.25 1.75 y p aa NaN 0.000 f f 0 t g 00150 1 0

In [96]:
data.A7.value_counts()


Out[96]:
v     399
h     138
bb     59
ff     57
z       8
j       8
dd      6
n       4
o       2
dtype: int64

In [97]:
# as with A6, fill A7 by sampling from its observed category distribution
counts = data.A7.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]), 
                                 counts/counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A7.count())
data.loc[data.A7.isnull(), "A7"] = counts.iloc[fill_idxs].index.values

data.A7.value_counts()


Out[97]:
v     400
h     138
bb     60
ff     58
z       8
j       8
dd      6
n       4
o       2
dtype: int64

In [98]:
data.info()
# based on this, the only remaining column with null values is A14


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Data columns (total 16 columns):
A1     684 non-null object
A2     684 non-null float64
A3     684 non-null float64
A4     684 non-null object
A5     684 non-null object
A6     684 non-null object
A7     684 non-null object
A8     684 non-null float64
A9     684 non-null object
A10    684 non-null object
A11    684 non-null int64
A12    684 non-null object
A13    684 non-null object
A14    677 non-null object
A15    684 non-null int64
A16    684 non-null int64
dtypes: float64(3), int64(3), object(10)
memory usage: 90.8+ KB

In [99]:
data[data.A14.isnull()]


Out[99]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
71 b 34.830000 4.000 u g d bb 12.500 t f 0 t g NaN 0 0
202 b 24.830000 2.750 u g c v 2.250 t t 6 f g NaN 600 1
243 a 18.750000 7.500 u g q v 2.710 t t 5 f g NaN 26726 1
278 b 24.580000 13.500 y p ff ff 0.000 f f 0 f g NaN 0 0
406 a 40.330000 8.125 y p k v 0.165 f t 2 f g NaN 18 0
445 a 37.430226 11.250 u g ff ff 0.000 f f 0 f g NaN 5200 0
626 b 22.000000 7.835 y p i bb 0.165 f f 0 t g NaN 0 0

In [100]:
data.A14.value_counts()


Out[100]:
00000    132
00200     35
00120     35
00160     34
00080     30
00100     30
00280     22
00180     18
00140     16
00240     14
00320     14
00300     13
00260     11
00220      9
00400      9
...
00980    1
00256    1
00470    1
00263    1
00431    1
00375    1
00434    1
00372    1
00276    1
00371    1
00075    1
00432    1
00333    1
00519    1
00017    1
Length: 170, dtype: int64

In [101]:
# A14 has 170 distinct values, so fill its nulls with the single most common value (the mode)
data.loc[data.A14.isnull(), 'A14'] = data.A14.mode()[0]
data.A14.value_counts()


Out[101]:
00000    139
00200     35
00120     35
00160     34
00080     30
00100     30
00280     22
00180     18
00140     16
00240     14
00320     14
00300     13
00260     11
00220      9
00400      9
...
00980    1
00256    1
00470    1
00263    1
00431    1
00375    1
00434    1
00372    1
00276    1
00371    1
00075    1
00432    1
00333    1
00519    1
00017    1
Length: 170, dtype: int64

In [102]:
data.describe()


Out[102]:
A2 A3 A8 A11 A15 A16
count 684.000000 684.000000 684.000000 684.000000 684.000000 684.000000
mean 31.489777 4.800468 2.242909 2.421053 1026.309942 0.442982
std 11.975893 4.979873 3.354657 4.879025 5232.060836 0.497102
min -3.499798 0.000000 0.000000 0.000000 0.000000 0.000000
25% 22.580000 1.030000 0.165000 0.000000 0.000000 0.000000
50% 28.500000 2.855000 1.000000 0.000000 5.000000 0.000000
75% 38.250000 7.500000 2.720000 3.000000 400.000000 1.000000
max 80.250000 28.000000 28.500000 67.000000 100000.000000 1.000000

In [103]:
'''
Now is probably the time to split this into train/test sets.
Two things made this confusing:
1. When to add dummy variables, and how that choice might affect things.
2. If you add dummies and then split, how do you do exploratory plots?

One option is to explore on a notional training set, then add dummies to
the full dataset and use train_test_split for the modeling. A sketch of
the split-first alternative follows below.
'''


Out[103]:
'\nNow is probably the time to split this into train/test sets.\nTwo things made this confusing:\n1. When to add dummy variables, and how that choice might affect things.\n2. If you add dummies and then split, how do you do exploratory plots?\n\nOne option is to explore on a notional training set, then add dummies to\nthe full dataset and use train_test_split for the modeling. A sketch of\nthe split-first alternative follows below.\n'
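
One way to resolve the question raised above is to split the raw rows first, explore and encode on the training portion, and then align the test columns to it. A sketch under that assumption (none of this was run in the original notebook; the variable names are illustrative):

# sketch: split first, then one-hot encode the training rows and align the test rows
idx = np.random.RandomState(3).permutation(data.index)
cut = int(len(idx) * 0.7)
raw_train, raw_test = data.loc[idx[:cut]], data.loc[idx[cut:]]
X_train = pd.get_dummies(raw_train.drop('A16', axis=1))
X_test = pd.get_dummies(raw_test.drop('A16', axis=1)).reindex(columns=X_train.columns, fill_value=0)
y_train, y_test = raw_train.A16.values, raw_test.A16.values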

In [104]:
## PLOT STUFF TO SEE WHAT IT LOOKS LIKE

data.A2.hist(bins = 50)


Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cb42350>

In [105]:
data.A1.value_counts().plot(kind = 'bar')


Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cc2b1d0>

In [106]:
data.A4.value_counts().plot(kind = 'bar')


Out[106]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cd69b50>

In [107]:
data.A5.value_counts().plot(kind = 'bar')


Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cf6ced0>

In [108]:
data.A6.value_counts().plot(kind = 'bar')


Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d081490>

In [109]:
data.A7.value_counts().plot(kind = 'bar')


Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d21e310>

In [110]:
data.A8.value_counts().plot(kind = 'bar')


Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d32f2d0>

In [111]:
data.A9.value_counts().plot(kind = 'bar')


Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d7cff10>

In [112]:
data.A10.value_counts().plot(kind = 'bar')


Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e9db0d0>

In [113]:
data.A11.value_counts().plot(kind = 'bar')


Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x10de01910>

In [114]:
data.A12.value_counts().plot(kind = 'bar')


Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ed23f10>

In [115]:
data.A13.value_counts().plot(kind = 'bar')


Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x10eda6150>

In [116]:
data.A14.value_counts().plot(kind = 'bar')


Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x10efc0b50>

In [117]:
data.A15.value_counts().plot(kind = 'bar')


Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f209590>

In [118]:
data.groupby(['A4'])['A16'].mean()


Out[118]:
A4
l     1.000000
u     0.493256
y     0.276074
Name: A16, dtype: float64

In [119]:
data.hist(bins = 50, figsize = (15,15));



In [120]:
data.corr()


Out[120]:
A2 A3 A8 A11 A15 A16
A2 1.000000 0.204907 0.393249 0.187415 0.019778 0.149974
A3 0.204907 1.000000 0.295079 0.268428 0.121992 0.211902
A8 0.393249 0.295079 1.000000 0.320413 0.050311 0.327280
A11 0.187415 0.268428 0.320413 1.000000 0.062924 0.410751
A15 0.019778 0.121992 0.050311 0.062924 1.000000 0.177302
A16 0.149974 0.211902 0.327280 0.410751 0.177302 1.000000

In [121]:
from pandas.tools.plotting import scatter_matrix

In [122]:
scatter_matrix(data, figsize = (20,20));



In [123]:
sns.factorplot("A1", hue = "A4", data = data, kind = "bar", palette = "Greens_d", size = 5);



In [124]:
sns.factorplot("A1", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);



In [125]:
sns.factorplot("A4", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);



In [126]:
sns.factorplot("A9", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)


Out[126]:
<seaborn.axisgrid.FacetGrid at 0x116fba610>

In [127]:
sns.factorplot("A10", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)


Out[127]:
<seaborn.axisgrid.FacetGrid at 0x1170a1690>

In [128]:
sns.factorplot("A12", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)


Out[128]:
<seaborn.axisgrid.FacetGrid at 0x116368f90>

In [129]:
sns.factorplot("A9", hue = "A10", data = data, kind = "bar", palette = "Greens_d", size = 5)


Out[129]:
<seaborn.axisgrid.FacetGrid at 0x1119d3690>

In [130]:
## DUMMIES

dummy = pd.get_dummies(data)
dummy.drop('A16', axis = 1, inplace = True)

In [131]:
dummy.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 684 entries, 0 to 689
Columns: 215 entries, A2 to A14_02000
dtypes: float64(213), int64(2)
memory usage: 1.1 MB
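
One caveat with full one-hot encoding: the dummy columns for each categorical feature sum to 1, so they are perfectly collinear. Regularized models cope, but a reference level per feature can be dropped; a sketch, assuming a pandas version that has the drop_first argument:

# sketch: drop the first level of each categorical to avoid the dummy-variable trap
dummy = pd.get_dummies(data.drop('A16', axis=1), drop_first=True)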

In [132]:
Target = data.A16.values

In [133]:
Features = dummy

In [134]:
## TRAIN TEST SPLIT
f_train, f_test, t_train, t_test = train_test_split(Features,
                                                    Target, random_state = 3, 
                                                    test_size = .3)

In [135]:
## LOGISTIC REGRESSION

lr = LogisticRegression()

In [136]:
lr.fit(f_train, t_train)


Out[136]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [137]:
lr.score(f_test, t_test)


Out[137]:
0.81067961165048541

In [138]:
## CONFUSION MATRIX

t_pred = lr.predict(f_test)

In [139]:
confusion_matrix(t_test, t_pred)


Out[139]:
array([[89, 23],
       [16, 78]])

In [140]:
print classification_report(t_test, t_pred)


             precision    recall  f1-score   support

          0       0.85      0.79      0.82       112
          1       0.77      0.83      0.80        94

avg / total       0.81      0.81      0.81       206
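
This accuracy comes from a single 70/30 split; a hedged sketch of a cross-validated estimate for the same model (assuming cross_val_score is available under sklearn.cross_validation in this scikit-learn version):

# sketch: 5-fold cross-validated accuracy for logistic regression on the full dummy matrix
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(LogisticRegression(), Features.values, Target, cv=5)
print 'CV accuracy: ', scores.mean(), '+/-', scores.std()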


In [141]:
## SVM

est = LinearSVC(C=1e+1)

In [142]:
est.fit(f_train, t_train)


Out[142]:
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [143]:
est.score(f_test, t_test) 
# the score is consistently in the .7 to .8 range when the test size is .2 instead of .3


Out[143]:
0.57766990291262132
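
The linear SVM scores well below logistic regression here, and SVMs are sensitive to feature scale (A15 runs up to 100000 while the dummy columns are 0/1). A hedged sketch of standardizing the features first (not run in the original notebook):

# sketch: standardize features, then fit the linear SVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scaled_svm = Pipeline([('scale', StandardScaler()), ('svm', LinearSVC(C=1e+1))])
scaled_svm.fit(f_train, t_train)
print 'Scaled LinearSVC accuracy: ', scaled_svm.score(f_test, t_test)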

In [144]:
## GRID SEARCH
# note: np.logspace(-3, -3, 10) returns ten copies of 0.001, so this grid only
# ever tries C = 0.001 (a corrected sketch follows the classification report below)
d = {}
d['C'] = np.logspace(-3,-3,10)

In [145]:
gs = GridSearchCV(LinearSVC(),d)
gs.fit(f_train, t_train)


Out[145]:
GridSearchCV(cv=None,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([ 0.001,  0.001,  0.001,  0.001,  0.001,  0.001,  0.001,  0.001,
        0.001,  0.001])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [146]:
gs.best_params_, gs.best_score_


Out[146]:
({'C': 0.001}, 0.81380753138075312)

In [147]:
gs.score(f_train, t_train)


Out[147]:
0.82635983263598323

In [148]:
t_pred = gs.predict(f_test)

In [149]:
confusion_matrix(t_test, t_pred)


Out[149]:
array([[101,  11],
       [ 27,  67]])

In [150]:
print classification_report(t_test, t_pred)


             precision    recall  f1-score   support

          0       0.79      0.90      0.84       112
          1       0.86      0.71      0.78        94

avg / total       0.82      0.82      0.81       206
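
Since the grid above only ever contained C = 0.001, the search was effectively a no-op. A sketch of the search that was presumably intended, using the same 10^-3 to 10^3 range as the SVC search further down:

# sketch: LinearSVC grid over a genuine range of C values
gs_linear = GridSearchCV(LinearSVC(), {'C': np.logspace(-3, 3, 10)})
gs_linear.fit(f_train, t_train)
print gs_linear.best_params_, gs_linear.best_score_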


In [151]:
## SVC - the black box that is the nonlinear kernel

svc = SVC()

In [152]:
svc.fit(f_train, t_train)


Out[152]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [153]:
svc.score(f_train, t_train)


Out[153]:
0.84937238493723854

In [154]:
t_pred = svc.predict(f_test)

In [155]:
confusion_matrix(t_test, t_pred)


Out[155]:
array([[97, 15],
       [30, 64]])

In [156]:
print classification_report(t_test, t_pred)


             precision    recall  f1-score   support

          0       0.76      0.87      0.81       112
          1       0.81      0.68      0.74        94

avg / total       0.78      0.78      0.78       206


In [157]:
d = {}
d['C'] = np.logspace(-3,3,10)

In [158]:
gs = GridSearchCV(SVC(),d)
gs.fit(f_train, t_train)


Out[158]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [159]:
gs.best_params_, gs.best_score_


Out[159]:
({'C': 10.0}, 0.7615062761506276)

In [160]:
gs.score(f_train, t_train)


Out[160]:
0.95188284518828448

In [161]:
t_pred = gs.predict(f_test)

In [162]:
confusion_matrix(t_test, t_pred)


Out[162]:
array([[88, 24],
       [26, 68]])

In [163]:
print classification_report(t_test, t_pred)


             precision    recall  f1-score   support

          0       0.77      0.79      0.78       112
          1       0.74      0.72      0.73        94

avg / total       0.76      0.76      0.76       206


In [164]:
## Using a grid search over values of C and gamma to find the best decision function
param = {'C':np.logspace(-3,3,10), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)

In [165]:
gs.fit(f_train, t_train)


Out[165]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   4.64159e-03,   2.15443e-02,   1.00000e-01,
         4.64159e-01,   2.15443e+00,   1.00000e+01,   4.64159e+01,
         2.15443e+02,   1.00000e+03]), 'gamma': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [166]:
gs.score(f_train, t_train)


Out[166]:
0.93933054393305437

In [167]:
t_pred = gs.predict(f_test)

In [168]:
confusion_matrix(t_test, t_pred)


Out[168]:
array([[85, 27],
       [24, 70]])

In [169]:
print classification_report(t_test, t_pred)


             precision    recall  f1-score   support

          0       0.78      0.76      0.77       112
          1       0.72      0.74      0.73        94

avg / total       0.75      0.75      0.75       206
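
A final note: gs.score was called on the training data throughout, which measures fit rather than generalization. For a comparison with the earlier models, score the held-out test set instead (the confusion matrix above implies roughly (85 + 70) / 206 ≈ 0.75):

# sketch: held-out accuracy of the C/gamma grid-searched SVC
print 'Test accuracy: ', gs.score(f_test, t_test)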

