In [74]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
# Enable inline Plots
%matplotlib inline
# Limit rows displayed in notebook
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.precision', 2)
In [75]:
columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
data = pd.read_csv('../hw2/crx.data', names = columns)
In [76]:
data.info()
In [77]:
data.describe()
Out[77]:
In [78]:
data.replace('?', np.nan, inplace = True)
In [79]:
data.A16.replace({'+':1, '-':0}, inplace = True)
In [80]:
data.head()
Out[80]:
In [81]:
## fill A1
a_count = data.A1[data.A1 == 'a'].count()
b_count = data.A1[data.A1 == 'b'].count()
# fraction of values that are 'b'
float(b_count)/(a_count + b_count)
Out[81]:
In [82]:
# figure out how many null values are in A1
data[data.A1.isnull()]
#data[data.A1.isnull()].count()
# the non-null counts in the other columns (e.g. A2) show there are 12 rows where A1 is null
Out[82]:
In [83]:
# fill null values in A1 based on the % distribution of 'a' and 'b'
# size = 12 because that's the number of null values in A1
A1_fillna = np.random.choice(('a', 'b'), size = 12, p = (.31, .69))
In [84]:
data.loc[data.A1.isnull(), 'A1'] = A1_fillna
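For reference, a minimal sketch of the same A1 fill that derives the null count and the a/b proportions from the data instead of hardcoding 12 and (.31, .69); the variable names here are illustrative and it is equivalent to the two cells above.
# illustrative alternative: compute the size and probabilities rather than hardcoding them
n_missing_A1 = data.A1.isnull().sum()
p_b = float(b_count) / (a_count + b_count)
data.loc[data.A1.isnull(), 'A1'] = np.random.choice(('a', 'b'), size = n_missing_A1,
                                                     p = (1 - p_b, p_b))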
In [85]:
## fill A2
# they're all numbers but listed as objects --> change to numbers
data.A2 = data.A2.astype(float)
# get mean and std dev so we can fill null values
print('Mean: ', data.A2.mean())
print('Std dev: ', data.A2.std())
In [86]:
# create a function that imputes the null values with random draws from a normal
# distribution parameterized by A2's mean and standard deviation
def get_A2_impute_values(n):
    return np.random.normal(data.A2.mean(), data.A2.std(), n)
In [87]:
data.loc[data.A2.isnull(), 'A2'] = get_A2_impute_values(12)
In [88]:
## null values in A4
data[data.A4.isnull()]
# these 6 rows are missing values in several other columns as well
Out[88]:
In [89]:
# if we remove them, will it significantly change the class balance?
approved_count = data.A16[data.A16 == 1].count()
notapproved_count = data.A16[data.A16 == 0].count()
# 4 of the 6 rows were approved (A16 == 1) and 2 were not
print(float(approved_count - 4) / approved_count)
print(float(notapproved_count - 2) / notapproved_count)
# based on these percentages, removing the 6 rows is not a big deal
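The 4 and 2 above are read off the table of null-A4 rows; a small sketch that computes them directly (the variable name is illustrative):
# check: class counts among the rows where A4 is null
null_A4_by_class = data.loc[data.A4.isnull(), 'A16'].value_counts()
print(null_A4_by_class)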
In [90]:
# drop all rows where value in A4 is null
data.dropna(subset=['A4'], how='all', inplace = True)
In [91]:
## null values in A6 and A7
data[data.A6.isnull()]
Out[91]:
In [92]:
sum(data.A6.value_counts())
Out[92]:
In [93]:
data.A6.value_counts()
Out[93]:
In [94]:
# fill the nulls in A6 by sampling from the empirical distribution of the observed categories
from scipy import stats
counts = data.A6.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                 counts / counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A6.count())
data.loc[data.A6.isnull(), "A6"] = counts.iloc[fill_idxs].index.values
data.A6.value_counts()
Out[94]:
In [95]:
data[data.A7.isnull()]
Out[95]:
In [96]:
data.A7.value_counts()
Out[96]:
In [97]:
# as with A6, fill the nulls in A7 by sampling from the empirical distribution of the observed categories
counts = data.A7.value_counts()
dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                 counts / counts.sum()))
fill_idxs = dist.rvs(size=data.shape[0] - data.A7.count())
data.loc[data.A7.isnull(), "A7"] = counts.iloc[fill_idxs].index.values
data.A7.value_counts()
Out[97]:
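Since the A6 and A7 fills are identical except for the column name, here is a hedged sketch of a small helper that could replace both cells; the function name is made up for illustration.
# illustrative helper: fill a categorical column's nulls by sampling from its
# empirical distribution (same logic as the A6/A7 cells above)
def fill_categorical_na(df, col):
    counts = df[col].value_counts()
    dist = stats.rv_discrete(values=(np.arange(counts.shape[0]),
                                     counts / counts.sum()))
    fill_idxs = dist.rvs(size=df.shape[0] - df[col].count())
    df.loc[df[col].isnull(), col] = counts.iloc[fill_idxs].index.values

# usage: fill_categorical_na(data, 'A6'); fill_categorical_na(data, 'A7')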
In [98]:
data.info()
# based on this, we determine the only remaining column with null values is A14
In [99]:
data[data.A14.isnull()]
Out[99]:
In [100]:
data.A14.value_counts()
Out[100]:
In [101]:
# for A14 we fill the remaining nulls with the mode
data.loc[data.A14.isnull(), 'A14'] = data.A14.mode()[0]
data.A14.value_counts()
Out[101]:
In [102]:
data.describe()
Out[102]:
In [103]:
'''
Now is probably the time to split this into train/test...
I got a little overwhelmed with:
1. When to add dummies and how that might impact things.
2. If you add dummies and then split, how do you do exploratory plots?
I guess you could do the exploratory plots on the original data, then add
dummies to the whole dataset and train-test-split for the modeling
(a leakage-aware alternative is sketched after this cell).
'''
Out[103]:
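One common way to sidestep the dummies-then-split question is to keep the raw data for exploratory plots and let the encoding happen inside a pipeline that is fit on the training fold only. A hedged sketch, assuming a newer scikit-learn that has ColumnTransformer and OneHotEncoder; this is illustrative and not what the rest of this notebook does.
# illustrative sketch: split first, then one-hot encode inside a pipeline so the
# encoder only ever sees the training data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

cat_cols = data.drop('A16', axis=1).select_dtypes(include=['object']).columns.tolist()
pre = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                        remainder='passthrough')
pipe = Pipeline([('pre', pre), ('clf', LogisticRegression())])

Xtr, Xte, ytr, yte = train_test_split(data.drop('A16', axis=1), data.A16,
                                      test_size=.3, random_state=3)
pipe.fit(Xtr, ytr)
pipe.score(Xte, yte)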
In [104]:
## PLOT STUFF TO SEE WHAT IT LOOKS LIKE
data.A2.hist(bins = 50)
Out[104]:
In [105]:
data.A1.value_counts().plot(kind = 'bar')
Out[105]:
In [106]:
data.A4.value_counts().plot(kind = 'bar')
Out[106]:
In [107]:
data.A5.value_counts().plot(kind = 'bar')
Out[107]:
In [108]:
data.A6.value_counts().plot(kind = 'bar')
Out[108]:
In [109]:
data.A7.value_counts().plot(kind = 'bar')
Out[109]:
In [110]:
data.A8.value_counts().plot(kind = 'bar')
Out[110]:
In [111]:
data.A9.value_counts().plot(kind = 'bar')
Out[111]:
In [112]:
data.A10.value_counts().plot(kind = 'bar')
Out[112]:
In [113]:
data.A11.value_counts().plot(kind = 'bar')
Out[113]:
In [114]:
data.A12.value_counts().plot(kind = 'bar')
Out[114]:
In [115]:
data.A13.value_counts().plot(kind = 'bar')
Out[115]:
In [116]:
data.A14.value_counts().plot(kind = 'bar')
Out[116]:
In [117]:
data.A15.value_counts().plot(kind = 'bar')
Out[117]:
In [118]:
data.groupby(['A4'])['A16'].mean()
Out[118]:
In [119]:
data.hist(bins = 50, figsize = (15,15));
In [120]:
data.corr()
Out[120]:
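Since seaborn is already imported, a heatmap can make the same correlation table easier to scan; a small optional sketch, assuming data.corr() behaves as in the cell above.
# optional: visualize the correlation matrix as a heatmap
sns.heatmap(data.corr(), annot=True, cmap='Greens');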
In [121]:
from pandas.plotting import scatter_matrix
In [122]:
scatter_matrix(data, figsize = (20,20));
In [123]:
sns.factorplot("A1", hue = "A4", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [124]:
sns.factorplot("A1", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [125]:
sns.factorplot("A4", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5);
In [126]:
sns.factorplot("A9", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[126]:
In [127]:
sns.factorplot("A10", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[127]:
In [128]:
sns.factorplot("A12", hue = "A16", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[128]:
In [129]:
sns.factorplot("A9", hue = "A10", data = data, kind = "bar", palette = "Greens_d", size = 5)
Out[129]:
In [130]:
## DUMMIES
dummy = pd.get_dummies(data)
dummy.drop('A16', axis = 1, inplace = True)
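A hedged note on the encoding: pd.get_dummies accepts drop_first=True, which drops one level per categorical column and avoids perfectly redundant dummy columns. A sketch of that variant (if used, it would replace the cell above; the name dummy_alt is illustrative):
# illustrative variant: drop the first level of each categorical to avoid redundant columns
dummy_alt = pd.get_dummies(data, drop_first=True).drop('A16', axis=1)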
In [131]:
dummy.info()
In [132]:
Target = data.A16.values
In [133]:
Features = dummy
In [134]:
## TRAIN TEST SPLIT
f_train, f_test, t_train, t_test = train_test_split(Features, Target,
                                                     random_state = 3,
                                                     test_size = .3)
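If you want the train and test folds to keep the same approved/not-approved ratio, train_test_split also accepts a stratify argument; an optional variant of the split above (if used, it would replace it):
# optional variant: stratify on the target so both folds keep the class balance
f_train, f_test, t_train, t_test = train_test_split(Features, Target,
                                                     random_state = 3,
                                                     test_size = .3,
                                                     stratify = Target)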
In [135]:
## LOGISTIC REGRESSION
lr = LogisticRegression()
In [136]:
lr.fit(f_train, t_train)
Out[136]:
In [137]:
lr.score(f_test, t_test)
Out[137]:
In [138]:
## CONFUSION MATRIX
t_pred = lr.predict(f_test)
In [139]:
confusion_matrix(t_test, t_pred)
Out[139]:
In [140]:
print(classification_report(t_test, t_pred))
In [141]:
## SVM
est = LinearSVC(C=1e+1)
In [142]:
est.fit(f_train, t_train)
Out[142]:
In [143]:
est.score(f_test, t_test)
# the score is consistently in the 0.7 to 0.8 range when test_size is 0.2 instead of 0.3
Out[143]:
In [144]:
## GRID SEARCH
d = {}
# search C over a log-spaced grid from 1e-3 to 1e3
d['C'] = np.logspace(-3, 3, 10)
In [145]:
gs = GridSearchCV(LinearSVC(),d)
gs.fit(f_train, t_train)
Out[145]:
In [146]:
gs.best_params_, gs.best_score_
Out[146]:
In [147]:
gs.score(f_train, t_train)
Out[147]:
In [148]:
t_pred = gs.predict(f_test)
In [149]:
confusion_matrix(t_test, t_pred)
Out[149]:
In [150]:
print(classification_report(t_test, t_pred))
In [151]:
## SVC - SVM with the default nonlinear (RBF) kernel
svc = SVC()
In [152]:
svc.fit(f_train, t_train)
Out[152]:
In [153]:
svc.score(f_train, t_train)
Out[153]:
In [154]:
t_pred = svc.predict(f_test)
In [155]:
confusion_matrix(t_test, t_pred)
Out[155]:
In [156]:
print(classification_report(t_test, t_pred))
In [157]:
d = {}
d['C'] = np.logspace(-3,3,10)
In [158]:
gs = GridSearchCV(SVC(),d)
gs.fit(f_train, t_train)
Out[158]:
In [159]:
gs.best_params_, gs.best_score_
Out[159]:
In [160]:
gs.score(f_train, t_train)
Out[160]:
In [161]:
t_pred = gs.predict(f_test)
In [162]:
confusion_matrix(t_test, t_pred)
Out[162]:
In [163]:
print(classification_report(t_test, t_pred))
In [164]:
## Grid search over C and gamma to find the best-performing SVC
param = {'C':np.logspace(-3,3,10), 'gamma' : np.logspace(-3, 3, 5)}
gs = GridSearchCV(SVC(), param)
In [165]:
gs.fit(f_train, t_train)
Out[165]:
In [166]:
gs.score(f_train, t_train)
Out[166]:
In [167]:
t_pred = gs.predict(f_test)
In [168]:
confusion_matrix(t_test, t_pred)
Out[168]:
In [169]:
print(classification_report(t_test, t_pred))
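As a wrap-up, a small hedged sketch that scores each of the models fitted above on the held-out test set in one place; it only reuses estimators already fit in this notebook.
# summary: held-out test accuracy for the models fit above
for name, model in [('LogisticRegression', lr),
                    ('LinearSVC', est),
                    ('SVC (default)', svc),
                    ('SVC (grid-searched C, gamma)', gs)]:
    print(name, model.score(f_test, t_test))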
In [ ]: