In [13]:
pwd


Out[13]:
u'C:\\Users\\Wasit\\Documents\\GitHub\\book_pae\\wasit\\scikit'

In [14]:
import pandas as pd
import numpy as np
df= pd.read_pickle("../df_m20/df5.pkl")
df=df[df['COURSEID']=='CS213']
#df["COURSEID"].value_counts()
#df=pd.read_csv('../df_m20/df_CS213.csv',index_col=0)
df=df.iloc[np.random.permutation(len(df))]
df


Out[14]:
STUDENTID COURSEID GRADE TERM AN201 AS171 AS177 AS178 AT207 AT316 ... TU110 TU111 TU115 TU116 TU120 TU122 TU130 TU153 TU154 TU156
7611 351662 CS213 9 25541 0 0 0 0 0 0 ... 9 0 0 0 11 0 0 0 7 0
13455 361085 CS213 7 25541 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 5 0
11635 354653 CS213 6 25541 0 0 0 0 0 0 ... 0 0 0 0 7 0 0 0 9 0
20164 393365 CS213 10 25561 0 0 0 0 0 0 ... 8 0 0 0 0 0 0 0 11 0
21455 397398 CS213 11 25561 0 0 0 0 0 0 ... 10 0 0 0 0 0 0 0 13 0
8170 351673 CS213 10 25541 0 0 0 0 0 0 ... 11 0 0 0 11 0 0 0 7 0
19364 383172 CS213 9 25551 0 0 0 0 0 0 ... 9 0 0 0 11 0 0 0 6 0
13231 361077 CS213 8 25541 0 0 0 0 0 0 ... 0 0 0 0 11 0 0 0 9 0
10051 351717 CS213 9 25552 0 0 0 0 0 0 ... 8 0 0 0 13 12 10 0 10 0
15330 379777 CS213 9 25551 0 0 0 0 0 0 ... 8 0 0 0 0 0 0 0 12 0
5478 336752 CS213 8 25531 0 0 0 0 0 0 ... 0 0 0 0 10 0 0 0 7 0
19317 383171 CS213 10 25551 0 0 0 0 0 0 ... 0 0 0 0 11 0 0 0 10 0
354 324842 CS213 6 25542 0 0 0 0 0 12 ... 10 0 0 0 10 0 8 0 7 0
20884 397378 CS213 10 25561 0 0 0 0 0 0 ... 7 0 0 0 0 0 0 0 9 0
20706 397374 CS213 11 25561 0 0 0 0 0 0 ... 10 0 0 0 10 0 0 0 9 0
26210 423611 CS213 7 25571 0 0 0 0 0 0 ... 13 0 0 0 0 0 11 0 10 0
3180 329235 CS213 10 25542 0 0 0 0 0 11 ... 7 0 0 0 0 9 9 0 8 0
171 321849 CS213 8 25531 0 0 0 0 0 0 ... 10 0 0 0 0 0 9 0 10 0
9366 351700 CS213 10 25541 0 0 0 0 0 0 ... 11 0 0 0 11 0 0 0 9 0
22699 398022 CS213 7 25561 0 0 0 0 0 0 ... 0 0 0 0 12 0 0 0 7 0
16047 379796 CS213 13 25551 0 0 0 0 0 0 ... 9 0 0 0 10 7 0 0 9 0
13715 361096 CS213 9 25552 0 0 0 0 0 0 ... 10 0 0 0 12 0 10 0 5 0
29513 444145 CS213 7 25571 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2903 329230 CS213 11 25531 0 0 0 0 0 0 ... 7 0 0 0 9 0 12 0 10 0
3437 336701 CS213 10 25552 0 0 0 0 0 0 ... 9 0 0 0 11 0 9 0 9 0
18866 383159 CS213 11 25551 0 0 0 0 0 0 ... 8 0 0 0 11 0 0 0 7 0
25342 423580 CS213 6 25571 0 0 0 0 0 0 ... 10 0 0 0 11 9 10 0 7 0
19929 392361 CS213 12 25561 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 13 0
25154 423574 CS213 6 25571 0 0 0 0 0 0 ... 9 0 0 0 10 0 9 0 13 0
14888 379764 CS213 6 25552 0 0 0 0 0 0 ... 10 0 0 0 10 0 0 0 9 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
837 329181 CS213 13 25531 0 0 0 0 0 0 ... 10 0 0 0 11 0 12 0 13 0
283 324840 CS213 7 25532 0 0 0 0 0 0 ... 13 0 0 0 10 0 10 0 7 0
9026 351692 CS213 11 25552 0 0 0 0 0 11 ... 9 0 0 0 11 0 9 0 9 0
24806 415070 CS213 6 25571 0 0 0 0 0 0 ... 10 0 0 0 9 0 9 0 7 0
21683 397404 CS213 11 25561 0 0 0 0 0 0 ... 9 0 0 0 0 8 0 0 10 0
13491 361086 CS213 13 25541 0 0 0 0 0 0 ... 0 0 0 0 11 0 0 0 13 0
15003 379768 CS213 7 25551 0 0 0 0 0 0 ... 8 0 0 0 10 0 0 12 8 0
16410 379807 CS213 8 25551 0 0 0 0 0 0 ... 7 0 0 0 0 0 0 0 9 0
14524 379757 CS213 9 25551 0 0 0 0 0 0 ... 10 0 0 0 12 8 0 0 9 0
23881 402951 CS213 7 25561 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 8 0
23710 402947 CS213 6 25561 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 10 0
17097 380302 CS213 9 25551 0 0 0 0 0 0 ... 8 0 0 0 12 0 0 0 9 0
8523 351681 CS213 9 25541 0 0 0 0 0 0 ... 9 0 0 0 11 0 0 0 7 0
23962 402953 CS213 6 25561 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 9 0
14250 369353 CS213 7 25551 0 0 0 0 0 11 ... 10 0 0 0 0 0 0 0 5 0
1210 329188 CS213 9 25551 0 0 0 0 0 0 ... 8 0 0 0 8 0 0 0 9 0
2659 329222 CS213 9 25531 0 0 0 0 0 0 ... 10 0 0 0 9 0 0 0 9 0
28313 427191 CS213 6 25571 0 0 0 0 0 0 ... 0 0 0 0 10 0 0 0 8 0
12091 354666 CS213 10 25552 0 0 0 0 0 0 ... 9 0 0 0 12 0 11 0 6 0
12666 361061 CS213 9 25541 0 0 0 0 0 0 ... 0 0 0 0 9 0 0 0 9 0
17384 380309 CS213 10 25562 0 0 0 0 0 0 ... 7 0 0 0 13 0 8 0 5 0
22274 398008 CS213 8 25561 0 0 0 0 0 0 ... 0 0 0 0 13 0 0 0 10 0
8849 351688 CS213 5 25552 0 0 0 0 0 0 ... 5 0 0 0 11 0 0 0 7 0
765 329179 CS213 5 25531 0 0 0 0 0 0 ... 9 0 0 0 10 0 0 0 8 0
20402 397367 CS213 9 25561 0 0 0 0 0 0 ... 7 0 0 0 0 0 0 0 7 0
18646 383153 CS213 10 25571 0 0 0 0 0 0 ... 8 0 0 0 12 0 8 0 6 0
12801 361064 CS213 7 25541 0 0 0 0 0 0 ... 0 0 0 0 12 0 0 0 7 0
13862 368308 CS213 13 25551 0 0 0 0 0 0 ... 8 0 0 0 11 8 0 0 10 0
26849 423635 CS213 6 25571 0 5 0 0 0 0 ... 7 0 0 0 9 0 9 0 9 0
28657 427206 CS213 5 25571 0 0 0 0 0 0 ... 0 0 0 0 10 0 0 0 7 0

765 rows × 199 columns


In [15]:
x_head=df.columns[3:]
y_head=['GRADE']
print x_head
print y_head
x=df.as_matrix(x_head)
y=df.as_matrix(y_head).T[0]


Index([u'TERM', u'AN201', u'AS171', u'AS177', u'AS178', u'AT207', u'AT316',
       u'AT326', u'AT336', u'AT346',
       ...
       u'TU110', u'TU111', u'TU115', u'TU116', u'TU120', u'TU122', u'TU130',
       u'TU153', u'TU154', u'TU156'],
      dtype='object', length=196)
['GRADE']

In [16]:
y


Out[16]:
array([ 9,  7,  6, 10, 11, 10,  9,  8,  9,  9,  8, 10,  6, 10, 11,  7, 10,
        8, 10,  7, 13,  9,  7, 11, 10, 11,  6, 12,  6,  6, 12, 10,  6,  8,
       12,  6, 11,  6, 11,  8,  9,  9,  9,  6,  7,  9, 11,  6,  9,  5,  8,
        7,  5,  5,  9,  7, 11, 10,  9,  9, 10,  8,  9,  6, 10, 10,  9, 10,
        9,  5,  9,  7,  8,  6,  9,  9, 13,  8,  9,  9,  9, 11,  7, 10,  7,
       11, 10, 12, 10, 12,  8,  8,  6, 11,  9, 10, 10,  6,  9,  8,  6, 11,
        8,  9,  8,  7,  8,  9, 11, 13, 11,  8, 12,  9,  9,  8, 10,  9,  8,
        8,  8,  8, 13,  8, 11,  6,  9,  9,  8, 10,  9,  5, 11,  7,  7,  6,
       13,  6,  6,  9,  7,  8, 10,  9,  7,  7,  7,  9, 12, 10, 13,  9,  9,
       10, 10,  7,  9,  6,  9, 11,  9, 11,  8, 11,  9, 10,  9, 10,  6,  9,
       13,  9,  9,  9,  5,  5,  8,  9, 11,  6,  6,  7, 10,  7,  9,  9, 13,
        9, 10,  8,  8,  5,  5,  6,  9,  9, 12, 10, 10, 10,  9, 12,  9,  6,
       10, 10,  9, 11,  6,  6,  9,  9, 11,  6,  9,  9,  9,  6,  9, 10, 10,
       11, 11,  8,  9,  7, 10,  6,  7,  5, 10,  9,  9,  9, 11,  5, 12, 12,
       10,  9,  7, 10, 11,  9,  9, 11, 11,  5, 10, 12, 12,  6,  7,  9,  7,
        8,  7, 13, 12,  8, 13,  9,  5,  9,  7,  9, 10,  8, 13, 10,  8,  9,
        8,  6,  8,  9,  8,  8,  9,  8, 13, 13,  7, 11,  8,  9,  9, 10, 11,
        7,  9,  9,  9, 10,  7,  7,  9,  8, 10, 13, 10,  7, 12, 11,  9,  9,
       13,  6,  7,  7,  7, 10,  8,  5,  7, 12,  7,  7, 12, 10,  8, 12,  8,
        7,  6,  7,  8,  9,  8,  6,  8,  9, 13,  8,  9, 12,  9, 13,  7, 12,
        6, 12, 10, 12,  5,  8,  5, 10,  9,  7,  9, 10,  6,  6, 11,  9,  6,
        8,  8,  6,  7, 10,  9,  7, 11,  9, 12,  6,  6,  7,  6,  5, 11,  8,
       13, 10, 11, 10,  8,  9,  9, 11,  9,  9, 10,  9, 11,  9,  9, 10,  6,
       12,  8, 12,  9,  6,  5,  9, 11,  8,  9,  9,  5,  9, 11,  9,  5, 10,
       10,  6, 10,  8,  9,  9,  8,  9, 10,  6,  9,  6,  8,  8, 11,  8,  6,
       13, 10,  6,  6,  6,  8,  8,  6, 10,  8,  9,  9,  9,  8,  9,  8, 10,
        6,  7,  9, 10,  9,  7,  8, 13, 13,  9,  7,  9,  9,  9, 11, 11, 10,
        9,  8, 13, 10,  8, 11, 10,  8,  6,  8, 13,  8,  9, 10, 13,  6,  9,
       13,  6,  9,  8, 10,  9, 13,  6,  8, 13,  9,  9, 13,  9, 10,  8,  9,
        8, 10,  8,  6,  8,  9,  6,  7,  6, 10, 12,  5,  8,  7,  8,  7,  9,
        9,  7, 11, 10,  8, 12, 10,  6, 13, 13,  6,  8,  7,  6,  9,  9,  9,
        9, 11, 13,  9,  8,  8, 11,  6, 10,  6,  9, 13,  9,  9,  9,  6,  9,
        6,  9,  9, 10,  9, 11,  8,  8,  6,  6,  9,  9,  9, 10, 10, 11,  9,
       10,  7,  8, 10, 11, 12,  8,  9,  6,  8, 10,  9,  5,  9,  8,  9, 10,
        6, 11, 12,  8,  8, 10,  6, 10, 13,  6,  9, 11, 11,  5,  8, 13,  6,
        9, 10,  6,  5,  8,  6,  9,  8,  6, 10,  8,  7,  6, 11,  9,  9,  9,
       13,  6, 10,  8, 11,  9,  8,  8,  9,  7,  8, 13, 12,  7,  7,  8,  9,
        6,  9,  9, 13,  7,  8,  7,  9,  8,  9, 13,  9, 10,  9, 10,  9,  8,
       10,  8,  9,  6,  6, 10, 12,  7,  8, 10,  6, 10, 13,  9, 11,  6, 13,
        8,  8, 10,  7, 10,  6,  9,  7,  9,  6,  5, 10,  9,  7,  6,  7, 11,
        8,  9,  6,  9,  8,  7, 11,  8,  6,  6, 12,  9,  5, 11,  6, 11,  5,
        9,  5,  5, 10,  8,  9, 12, 10,  7,  8, 10,  9,  7, 12,  8,  6, 13,
        7,  7, 10,  9, 10,  7, 13,  9,  8,  6, 13,  7,  9,  8,  9,  9,  8,
        9,  6,  7, 10, 13,  7, 11,  6, 11, 13,  7,  8,  9,  7,  6,  9,  9,
        6,  7,  9,  9,  6, 10,  9, 10,  8,  5,  5,  9, 10,  7, 13,  6,  5], dtype=int64)

In [17]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None, 
    min_samples_split=1, random_state=0, max_features=None)
scores = cross_val_score(clf, x, y, cv=5)
print scores


[ 0.37579618  0.40645161  0.40131579  0.34437086  0.36      ]

In [18]:
clf.fit(x[len(y)/5:],y[len(y)/5:])
dif=[];
for i in xrange(len(y)/5):
    ga=clf.predict(x[i,:])[0]
    gp=y[i]
    dif.append(ga-gp)
    print "actual: %d, prediction: %d"%(ga,gp)


actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 7, prediction: 6
actual: 9, prediction: 10
actual: 10, prediction: 11
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 9, prediction: 8
actual: 10, prediction: 9
actual: 10, prediction: 9
actual: 6, prediction: 8
actual: 11, prediction: 10
actual: 6, prediction: 6
actual: 9, prediction: 10
actual: 10, prediction: 11
actual: 7, prediction: 7
actual: 10, prediction: 10
actual: 11, prediction: 8
actual: 8, prediction: 10
actual: 7, prediction: 7
actual: 10, prediction: 13
actual: 9, prediction: 9
actual: 7, prediction: 7
actual: 11, prediction: 11
actual: 10, prediction: 10
actual: 11, prediction: 11
actual: 6, prediction: 6
actual: 11, prediction: 12
actual: 6, prediction: 6
actual: 5, prediction: 6
actual: 13, prediction: 12
actual: 11, prediction: 10
actual: 6, prediction: 6
actual: 9, prediction: 8
actual: 13, prediction: 12
actual: 5, prediction: 6
actual: 13, prediction: 11
actual: 6, prediction: 6
actual: 9, prediction: 11
actual: 7, prediction: 8
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 6
actual: 8, prediction: 7
actual: 9, prediction: 9
actual: 10, prediction: 11
actual: 6, prediction: 6
actual: 8, prediction: 9
actual: 5, prediction: 5
actual: 10, prediction: 8
actual: 6, prediction: 7
actual: 7, prediction: 5
actual: 8, prediction: 5
actual: 11, prediction: 9
actual: 10, prediction: 7
actual: 11, prediction: 11
actual: 9, prediction: 10
actual: 10, prediction: 9
actual: 9, prediction: 9
actual: 6, prediction: 10
actual: 9, prediction: 8
actual: 10, prediction: 9
actual: 9, prediction: 6
actual: 10, prediction: 10
actual: 10, prediction: 10
actual: 11, prediction: 9
actual: 7, prediction: 10
actual: 9, prediction: 9
actual: 6, prediction: 5
actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 8, prediction: 8
actual: 6, prediction: 6
actual: 9, prediction: 9
actual: 5, prediction: 9
actual: 11, prediction: 13
actual: 8, prediction: 8
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 11, prediction: 11
actual: 7, prediction: 7
actual: 10, prediction: 10
actual: 7, prediction: 7
actual: 10, prediction: 11
actual: 10, prediction: 10
actual: 10, prediction: 12
actual: 11, prediction: 10
actual: 12, prediction: 12
actual: 7, prediction: 8
actual: 7, prediction: 8
actual: 5, prediction: 6
actual: 10, prediction: 11
actual: 8, prediction: 9
actual: 10, prediction: 10
actual: 8, prediction: 10
actual: 7, prediction: 6
actual: 6, prediction: 9
actual: 9, prediction: 8
actual: 6, prediction: 6
actual: 13, prediction: 11
actual: 9, prediction: 8
actual: 9, prediction: 9
actual: 7, prediction: 8
actual: 7, prediction: 7
actual: 8, prediction: 8
actual: 9, prediction: 9
actual: 8, prediction: 11
actual: 13, prediction: 13
actual: 8, prediction: 11
actual: 9, prediction: 8
actual: 11, prediction: 12
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 7, prediction: 8
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 9, prediction: 8
actual: 8, prediction: 8
actual: 10, prediction: 8
actual: 6, prediction: 8
actual: 10, prediction: 13
actual: 7, prediction: 8
actual: 9, prediction: 11
actual: 6, prediction: 6
actual: 8, prediction: 9
actual: 8, prediction: 9
actual: 7, prediction: 8
actual: 9, prediction: 10
actual: 10, prediction: 9
actual: 8, prediction: 5
actual: 11, prediction: 11
actual: 5, prediction: 7
actual: 8, prediction: 7
actual: 8, prediction: 6
actual: 9, prediction: 13
actual: 6, prediction: 6
actual: 7, prediction: 6
actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 9, prediction: 8
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 7, prediction: 7
actual: 8, prediction: 7
actual: 8, prediction: 7
actual: 9, prediction: 9
actual: 13, prediction: 12
actual: 9, prediction: 10
actual: 10, prediction: 13
actual: 5, prediction: 9
actual: 8, prediction: 9

In [19]:
df_dif=pd.DataFrame({'dif':dif})
%pylab inline
df_dif.dif.value_counts().sort_index().plot(kind='bar')


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a365470>