In [13]:
pwd
Out[13]:
u'C:\\Users\\Wasit\\Documents\\GitHub\\book_pae\\wasit\\scikit'
In [14]:
import pandas as pd
import numpy as np
df= pd.read_pickle("../df_m20/df5.pkl")
df=df[df['COURSEID']=='CS213']
#df["COURSEID"].value_counts()
#df=pd.read_csv('../df_m20/df_CS213.csv',index_col=0)
df=df.iloc[np.random.permutation(len(df))]
df
Out[14]:
STUDENTID
COURSEID
GRADE
TERM
AN201
AS171
AS177
AS178
AT207
AT316
...
TU110
TU111
TU115
TU116
TU120
TU122
TU130
TU153
TU154
TU156
7611
351662
CS213
9
25541
0
0
0
0
0
0
...
9
0
0
0
11
0
0
0
7
0
13455
361085
CS213
7
25541
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
5
0
11635
354653
CS213
6
25541
0
0
0
0
0
0
...
0
0
0
0
7
0
0
0
9
0
20164
393365
CS213
10
25561
0
0
0
0
0
0
...
8
0
0
0
0
0
0
0
11
0
21455
397398
CS213
11
25561
0
0
0
0
0
0
...
10
0
0
0
0
0
0
0
13
0
8170
351673
CS213
10
25541
0
0
0
0
0
0
...
11
0
0
0
11
0
0
0
7
0
19364
383172
CS213
9
25551
0
0
0
0
0
0
...
9
0
0
0
11
0
0
0
6
0
13231
361077
CS213
8
25541
0
0
0
0
0
0
...
0
0
0
0
11
0
0
0
9
0
10051
351717
CS213
9
25552
0
0
0
0
0
0
...
8
0
0
0
13
12
10
0
10
0
15330
379777
CS213
9
25551
0
0
0
0
0
0
...
8
0
0
0
0
0
0
0
12
0
5478
336752
CS213
8
25531
0
0
0
0
0
0
...
0
0
0
0
10
0
0
0
7
0
19317
383171
CS213
10
25551
0
0
0
0
0
0
...
0
0
0
0
11
0
0
0
10
0
354
324842
CS213
6
25542
0
0
0
0
0
12
...
10
0
0
0
10
0
8
0
7
0
20884
397378
CS213
10
25561
0
0
0
0
0
0
...
7
0
0
0
0
0
0
0
9
0
20706
397374
CS213
11
25561
0
0
0
0
0
0
...
10
0
0
0
10
0
0
0
9
0
26210
423611
CS213
7
25571
0
0
0
0
0
0
...
13
0
0
0
0
0
11
0
10
0
3180
329235
CS213
10
25542
0
0
0
0
0
11
...
7
0
0
0
0
9
9
0
8
0
171
321849
CS213
8
25531
0
0
0
0
0
0
...
10
0
0
0
0
0
9
0
10
0
9366
351700
CS213
10
25541
0
0
0
0
0
0
...
11
0
0
0
11
0
0
0
9
0
22699
398022
CS213
7
25561
0
0
0
0
0
0
...
0
0
0
0
12
0
0
0
7
0
16047
379796
CS213
13
25551
0
0
0
0
0
0
...
9
0
0
0
10
7
0
0
9
0
13715
361096
CS213
9
25552
0
0
0
0
0
0
...
10
0
0
0
12
0
10
0
5
0
29513
444145
CS213
7
25571
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2903
329230
CS213
11
25531
0
0
0
0
0
0
...
7
0
0
0
9
0
12
0
10
0
3437
336701
CS213
10
25552
0
0
0
0
0
0
...
9
0
0
0
11
0
9
0
9
0
18866
383159
CS213
11
25551
0
0
0
0
0
0
...
8
0
0
0
11
0
0
0
7
0
25342
423580
CS213
6
25571
0
0
0
0
0
0
...
10
0
0
0
11
9
10
0
7
0
19929
392361
CS213
12
25561
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
13
0
25154
423574
CS213
6
25571
0
0
0
0
0
0
...
9
0
0
0
10
0
9
0
13
0
14888
379764
CS213
6
25552
0
0
0
0
0
0
...
10
0
0
0
10
0
0
0
9
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
837
329181
CS213
13
25531
0
0
0
0
0
0
...
10
0
0
0
11
0
12
0
13
0
283
324840
CS213
7
25532
0
0
0
0
0
0
...
13
0
0
0
10
0
10
0
7
0
9026
351692
CS213
11
25552
0
0
0
0
0
11
...
9
0
0
0
11
0
9
0
9
0
24806
415070
CS213
6
25571
0
0
0
0
0
0
...
10
0
0
0
9
0
9
0
7
0
21683
397404
CS213
11
25561
0
0
0
0
0
0
...
9
0
0
0
0
8
0
0
10
0
13491
361086
CS213
13
25541
0
0
0
0
0
0
...
0
0
0
0
11
0
0
0
13
0
15003
379768
CS213
7
25551
0
0
0
0
0
0
...
8
0
0
0
10
0
0
12
8
0
16410
379807
CS213
8
25551
0
0
0
0
0
0
...
7
0
0
0
0
0
0
0
9
0
14524
379757
CS213
9
25551
0
0
0
0
0
0
...
10
0
0
0
12
8
0
0
9
0
23881
402951
CS213
7
25561
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
8
0
23710
402947
CS213
6
25561
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
10
0
17097
380302
CS213
9
25551
0
0
0
0
0
0
...
8
0
0
0
12
0
0
0
9
0
8523
351681
CS213
9
25541
0
0
0
0
0
0
...
9
0
0
0
11
0
0
0
7
0
23962
402953
CS213
6
25561
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
9
0
14250
369353
CS213
7
25551
0
0
0
0
0
11
...
10
0
0
0
0
0
0
0
5
0
1210
329188
CS213
9
25551
0
0
0
0
0
0
...
8
0
0
0
8
0
0
0
9
0
2659
329222
CS213
9
25531
0
0
0
0
0
0
...
10
0
0
0
9
0
0
0
9
0
28313
427191
CS213
6
25571
0
0
0
0
0
0
...
0
0
0
0
10
0
0
0
8
0
12091
354666
CS213
10
25552
0
0
0
0
0
0
...
9
0
0
0
12
0
11
0
6
0
12666
361061
CS213
9
25541
0
0
0
0
0
0
...
0
0
0
0
9
0
0
0
9
0
17384
380309
CS213
10
25562
0
0
0
0
0
0
...
7
0
0
0
13
0
8
0
5
0
22274
398008
CS213
8
25561
0
0
0
0
0
0
...
0
0
0
0
13
0
0
0
10
0
8849
351688
CS213
5
25552
0
0
0
0
0
0
...
5
0
0
0
11
0
0
0
7
0
765
329179
CS213
5
25531
0
0
0
0
0
0
...
9
0
0
0
10
0
0
0
8
0
20402
397367
CS213
9
25561
0
0
0
0
0
0
...
7
0
0
0
0
0
0
0
7
0
18646
383153
CS213
10
25571
0
0
0
0
0
0
...
8
0
0
0
12
0
8
0
6
0
12801
361064
CS213
7
25541
0
0
0
0
0
0
...
0
0
0
0
12
0
0
0
7
0
13862
368308
CS213
13
25551
0
0
0
0
0
0
...
8
0
0
0
11
8
0
0
10
0
26849
423635
CS213
6
25571
0
5
0
0
0
0
...
7
0
0
0
9
0
9
0
9
0
28657
427206
CS213
5
25571
0
0
0
0
0
0
...
0
0
0
0
10
0
0
0
7
0
765 rows × 199 columns
In [15]:
x_head=df.columns[3:]
y_head=['GRADE']
print x_head
print y_head
x=df.as_matrix(x_head)
y=df.as_matrix(y_head).T[0]
Index([u'TERM', u'AN201', u'AS171', u'AS177', u'AS178', u'AT207', u'AT316',
u'AT326', u'AT336', u'AT346',
...
u'TU110', u'TU111', u'TU115', u'TU116', u'TU120', u'TU122', u'TU130',
u'TU153', u'TU154', u'TU156'],
dtype='object', length=196)
['GRADE']
In [16]:
y
Out[16]:
array([ 9, 7, 6, 10, 11, 10, 9, 8, 9, 9, 8, 10, 6, 10, 11, 7, 10,
8, 10, 7, 13, 9, 7, 11, 10, 11, 6, 12, 6, 6, 12, 10, 6, 8,
12, 6, 11, 6, 11, 8, 9, 9, 9, 6, 7, 9, 11, 6, 9, 5, 8,
7, 5, 5, 9, 7, 11, 10, 9, 9, 10, 8, 9, 6, 10, 10, 9, 10,
9, 5, 9, 7, 8, 6, 9, 9, 13, 8, 9, 9, 9, 11, 7, 10, 7,
11, 10, 12, 10, 12, 8, 8, 6, 11, 9, 10, 10, 6, 9, 8, 6, 11,
8, 9, 8, 7, 8, 9, 11, 13, 11, 8, 12, 9, 9, 8, 10, 9, 8,
8, 8, 8, 13, 8, 11, 6, 9, 9, 8, 10, 9, 5, 11, 7, 7, 6,
13, 6, 6, 9, 7, 8, 10, 9, 7, 7, 7, 9, 12, 10, 13, 9, 9,
10, 10, 7, 9, 6, 9, 11, 9, 11, 8, 11, 9, 10, 9, 10, 6, 9,
13, 9, 9, 9, 5, 5, 8, 9, 11, 6, 6, 7, 10, 7, 9, 9, 13,
9, 10, 8, 8, 5, 5, 6, 9, 9, 12, 10, 10, 10, 9, 12, 9, 6,
10, 10, 9, 11, 6, 6, 9, 9, 11, 6, 9, 9, 9, 6, 9, 10, 10,
11, 11, 8, 9, 7, 10, 6, 7, 5, 10, 9, 9, 9, 11, 5, 12, 12,
10, 9, 7, 10, 11, 9, 9, 11, 11, 5, 10, 12, 12, 6, 7, 9, 7,
8, 7, 13, 12, 8, 13, 9, 5, 9, 7, 9, 10, 8, 13, 10, 8, 9,
8, 6, 8, 9, 8, 8, 9, 8, 13, 13, 7, 11, 8, 9, 9, 10, 11,
7, 9, 9, 9, 10, 7, 7, 9, 8, 10, 13, 10, 7, 12, 11, 9, 9,
13, 6, 7, 7, 7, 10, 8, 5, 7, 12, 7, 7, 12, 10, 8, 12, 8,
7, 6, 7, 8, 9, 8, 6, 8, 9, 13, 8, 9, 12, 9, 13, 7, 12,
6, 12, 10, 12, 5, 8, 5, 10, 9, 7, 9, 10, 6, 6, 11, 9, 6,
8, 8, 6, 7, 10, 9, 7, 11, 9, 12, 6, 6, 7, 6, 5, 11, 8,
13, 10, 11, 10, 8, 9, 9, 11, 9, 9, 10, 9, 11, 9, 9, 10, 6,
12, 8, 12, 9, 6, 5, 9, 11, 8, 9, 9, 5, 9, 11, 9, 5, 10,
10, 6, 10, 8, 9, 9, 8, 9, 10, 6, 9, 6, 8, 8, 11, 8, 6,
13, 10, 6, 6, 6, 8, 8, 6, 10, 8, 9, 9, 9, 8, 9, 8, 10,
6, 7, 9, 10, 9, 7, 8, 13, 13, 9, 7, 9, 9, 9, 11, 11, 10,
9, 8, 13, 10, 8, 11, 10, 8, 6, 8, 13, 8, 9, 10, 13, 6, 9,
13, 6, 9, 8, 10, 9, 13, 6, 8, 13, 9, 9, 13, 9, 10, 8, 9,
8, 10, 8, 6, 8, 9, 6, 7, 6, 10, 12, 5, 8, 7, 8, 7, 9,
9, 7, 11, 10, 8, 12, 10, 6, 13, 13, 6, 8, 7, 6, 9, 9, 9,
9, 11, 13, 9, 8, 8, 11, 6, 10, 6, 9, 13, 9, 9, 9, 6, 9,
6, 9, 9, 10, 9, 11, 8, 8, 6, 6, 9, 9, 9, 10, 10, 11, 9,
10, 7, 8, 10, 11, 12, 8, 9, 6, 8, 10, 9, 5, 9, 8, 9, 10,
6, 11, 12, 8, 8, 10, 6, 10, 13, 6, 9, 11, 11, 5, 8, 13, 6,
9, 10, 6, 5, 8, 6, 9, 8, 6, 10, 8, 7, 6, 11, 9, 9, 9,
13, 6, 10, 8, 11, 9, 8, 8, 9, 7, 8, 13, 12, 7, 7, 8, 9,
6, 9, 9, 13, 7, 8, 7, 9, 8, 9, 13, 9, 10, 9, 10, 9, 8,
10, 8, 9, 6, 6, 10, 12, 7, 8, 10, 6, 10, 13, 9, 11, 6, 13,
8, 8, 10, 7, 10, 6, 9, 7, 9, 6, 5, 10, 9, 7, 6, 7, 11,
8, 9, 6, 9, 8, 7, 11, 8, 6, 6, 12, 9, 5, 11, 6, 11, 5,
9, 5, 5, 10, 8, 9, 12, 10, 7, 8, 10, 9, 7, 12, 8, 6, 13,
7, 7, 10, 9, 10, 7, 13, 9, 8, 6, 13, 7, 9, 8, 9, 9, 8,
9, 6, 7, 10, 13, 7, 11, 6, 11, 13, 7, 8, 9, 7, 6, 9, 9,
6, 7, 9, 9, 6, 10, 9, 10, 8, 5, 5, 9, 10, 7, 13, 6, 5], dtype=int64)
In [17]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
clf = ExtraTreesClassifier(n_estimators=100, max_depth=None,
min_samples_split=1, random_state=0, max_features=None)
scores = cross_val_score(clf, x, y, cv=5)
print scores
[ 0.37579618 0.40645161 0.40131579 0.34437086 0.36 ]
In [18]:
clf.fit(x[len(y)/5:],y[len(y)/5:])
dif=[];
for i in xrange(len(y)/5):
ga=clf.predict(x[i,:])[0]
gp=y[i]
dif.append(ga-gp)
print "actual: %d, prediction: %d"%(ga,gp)
actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 7, prediction: 6
actual: 9, prediction: 10
actual: 10, prediction: 11
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 9, prediction: 8
actual: 10, prediction: 9
actual: 10, prediction: 9
actual: 6, prediction: 8
actual: 11, prediction: 10
actual: 6, prediction: 6
actual: 9, prediction: 10
actual: 10, prediction: 11
actual: 7, prediction: 7
actual: 10, prediction: 10
actual: 11, prediction: 8
actual: 8, prediction: 10
actual: 7, prediction: 7
actual: 10, prediction: 13
actual: 9, prediction: 9
actual: 7, prediction: 7
actual: 11, prediction: 11
actual: 10, prediction: 10
actual: 11, prediction: 11
actual: 6, prediction: 6
actual: 11, prediction: 12
actual: 6, prediction: 6
actual: 5, prediction: 6
actual: 13, prediction: 12
actual: 11, prediction: 10
actual: 6, prediction: 6
actual: 9, prediction: 8
actual: 13, prediction: 12
actual: 5, prediction: 6
actual: 13, prediction: 11
actual: 6, prediction: 6
actual: 9, prediction: 11
actual: 7, prediction: 8
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 6
actual: 8, prediction: 7
actual: 9, prediction: 9
actual: 10, prediction: 11
actual: 6, prediction: 6
actual: 8, prediction: 9
actual: 5, prediction: 5
actual: 10, prediction: 8
actual: 6, prediction: 7
actual: 7, prediction: 5
actual: 8, prediction: 5
actual: 11, prediction: 9
actual: 10, prediction: 7
actual: 11, prediction: 11
actual: 9, prediction: 10
actual: 10, prediction: 9
actual: 9, prediction: 9
actual: 6, prediction: 10
actual: 9, prediction: 8
actual: 10, prediction: 9
actual: 9, prediction: 6
actual: 10, prediction: 10
actual: 10, prediction: 10
actual: 11, prediction: 9
actual: 7, prediction: 10
actual: 9, prediction: 9
actual: 6, prediction: 5
actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 8, prediction: 8
actual: 6, prediction: 6
actual: 9, prediction: 9
actual: 5, prediction: 9
actual: 11, prediction: 13
actual: 8, prediction: 8
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 11, prediction: 11
actual: 7, prediction: 7
actual: 10, prediction: 10
actual: 7, prediction: 7
actual: 10, prediction: 11
actual: 10, prediction: 10
actual: 10, prediction: 12
actual: 11, prediction: 10
actual: 12, prediction: 12
actual: 7, prediction: 8
actual: 7, prediction: 8
actual: 5, prediction: 6
actual: 10, prediction: 11
actual: 8, prediction: 9
actual: 10, prediction: 10
actual: 8, prediction: 10
actual: 7, prediction: 6
actual: 6, prediction: 9
actual: 9, prediction: 8
actual: 6, prediction: 6
actual: 13, prediction: 11
actual: 9, prediction: 8
actual: 9, prediction: 9
actual: 7, prediction: 8
actual: 7, prediction: 7
actual: 8, prediction: 8
actual: 9, prediction: 9
actual: 8, prediction: 11
actual: 13, prediction: 13
actual: 8, prediction: 11
actual: 9, prediction: 8
actual: 11, prediction: 12
actual: 9, prediction: 9
actual: 9, prediction: 9
actual: 7, prediction: 8
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 9, prediction: 8
actual: 8, prediction: 8
actual: 10, prediction: 8
actual: 6, prediction: 8
actual: 10, prediction: 13
actual: 7, prediction: 8
actual: 9, prediction: 11
actual: 6, prediction: 6
actual: 8, prediction: 9
actual: 8, prediction: 9
actual: 7, prediction: 8
actual: 9, prediction: 10
actual: 10, prediction: 9
actual: 8, prediction: 5
actual: 11, prediction: 11
actual: 5, prediction: 7
actual: 8, prediction: 7
actual: 8, prediction: 6
actual: 9, prediction: 13
actual: 6, prediction: 6
actual: 7, prediction: 6
actual: 9, prediction: 9
actual: 9, prediction: 7
actual: 9, prediction: 8
actual: 9, prediction: 10
actual: 9, prediction: 9
actual: 7, prediction: 7
actual: 8, prediction: 7
actual: 8, prediction: 7
actual: 9, prediction: 9
actual: 13, prediction: 12
actual: 9, prediction: 10
actual: 10, prediction: 13
actual: 5, prediction: 9
actual: 8, prediction: 9
In [19]:
df_dif=pd.DataFrame({'dif':dif})
%pylab inline
df_dif.dif.value_counts().sort_index().plot(kind='bar')
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a365470>
Content source: wasit7/book_pae
Similar notebooks: