In [1]:
import pandas as pd
import numpy as np
import pickle  # NOTE(review): not used in any visible cell
import xlwt  # NOTE(review): not used in any visible cell
import matplotlib.pyplot as plt  # NOTE(review): not used in any visible cell
from sklearn.datasets import make_classification  # NOTE(review): unused here
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on modern versions these names live in
# sklearn.model_selection. Left as-is because the tracebacks below show this
# notebook ran against the old package layout.
from sklearn.cross_validation import cross_val_score  # NOTE(review): unused in visible cells
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
In [2]:
# Load the pre-filtered enrollment table (courses with <20 enrollments and
# rows with missing RESULT already dropped upstream), then normalize it:
# drop the stray index column, zero-fill remaining NaNs, and map letter
# grades onto an ordinal scale (A=8 ... U/U#=1).
grade_to_ordinal = {
    'A': 8, 'B+': 7, 'B': 7, 'C+': 6, 'C': 6, 'D+': 5, 'D': 5,
    'F': 4, 'W': 3, 'S': 2, 'S#': 2, 'U': 1, 'U#': 1,
}
df_file = pd.read_csv('../data/df_dropSub_less20_dropNaResult.csv',
                      delimiter=",", skip_blank_lines=True,
                      error_bad_lines=False)
df_file = (df_file
           .drop('Unnamed: 0', axis=1)
           .fillna(0)
           .replace(grade_to_ordinal))
In [3]:
# Inspect the cleaned frame (27995 rows x 118 columns per the output below).
df_file
Out[3]:
Unnamed: 0.1
3COURSEID
4RESULT
0STUDENTID
1ACADYEAR
2SEMESTER
AT316
AT326
BA291
CJ315
...
TA395
TH161
TU100
TU110
TU120
TU122
TU130
TU154
PROVINCEID
SCHOOLGPA
0
0
CS101
6
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
1
1
CS102
6
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
2
2
EL171
5
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
3
3
SC135
4
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
4
4
SC185
6
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
5
5
TH161
6
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
6
6
TU154
5
316644
2552
1
0
0
0
0
...
0
0
0
0
0
0
0
0
12
3.32
7
7
CS111
5
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
8
8
EL172
4
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
9
9
MA211
4
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
10
10
PY228
7
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
11
11
TU110
6
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
12
12
TU120
5
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
13
13
TU130
7
316644
2552
2
0
0
0
0
...
0
6
0
0
0
0
0
5
12
3.32
14
14
TU122
7
316644
2552
3
0
0
0
0
...
0
6
0
6
5
0
7
5
12
3.32
15
15
AT326
8
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
16
16
CS213
6
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
17
17
CS214
7
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
18
18
CS222
7
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
19
19
CS223
7
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
20
20
CS284
7
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
21
21
MA211
5
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
22
22
SW111
5
316644
2553
1
0
0
0
0
...
0
6
0
6
5
7
7
5
12
3.32
23
23
AT316
7
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
24
24
CS251
6
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
25
25
CS261
7
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
26
26
CS281
7
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
27
27
MA332
6
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
28
28
SC135
6
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
29
29
ST216
6
316644
2553
2
0
8
0
0
...
0
6
0
6
5
7
7
5
12
3.32
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
27965
31292
EL070
2
447240
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
48
3.75
27966
31293
MA211
4
447240
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
48
3.75
27967
31294
ST216
4
447240
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
48
3.75
27968
31295
TH161
6
447240
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
48
3.75
27969
31296
TU154
4
447240
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
48
3.75
27970
31297
CS101
5
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27971
31298
CS102
5
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27972
31299
CS105
5
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27973
31300
EL070
2
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27974
31301
MA211
3
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27975
31302
ST216
3
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27976
31303
TH161
5
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27977
31304
TU154
3
447241
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
13
2.60
27978
31313
CS101
5
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27979
31314
CS102
5
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27980
31315
CS105
5
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27981
31316
EL171
3
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27982
31317
MA211
3
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27983
31318
ST216
3
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27984
31319
TH161
6
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27985
31320
TU154
5
447242
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
34
2.93
27986
31325
SC185
3
447242
2557
2
0
0
0
0
...
0
6
0
0
0
0
0
5
34
2.93
27987
31329
CS101
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27988
31330
CS102
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27989
31331
CS105
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27990
31332
EL070
1
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27991
31333
MA211
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27992
31334
ST216
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27993
31335
TH161
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27994
31336
TU154
4
447243
2557
1
0
0
0
0
...
0
0
0
0
0
0
0
0
84
2.08
27995 rows × 118 columns
In [4]:
# Enrollment count per course ('3COURSEID'), in descending order.
count_courseId = df_file["3COURSEID"].value_counts()
# NOTE(review): despite the name, no "> 20 enrollments" filter is applied --
# more20 is just an alias of the full count series, so CS231 (count 1 in the
# output below) is still present and must be dropped manually later.
more20 = count_courseId
headers=list(df_file.columns.values)
# Placeholders; subjects/countSub are re-initialized and actually populated
# in a later cell.
subjects = []
countSub = 0
# Display the per-course counts.
more20
Out[4]:
MA211 1068
CS102 966
CS101 948
TU154 946
TH161 899
CS111 834
CS213 765
EL171 760
SC135 740
PY228 705
EL172 697
SC185 686
TU110 666
CS223 664
TU120 648
ST216 623
CS284 616
EL295 588
TU130 567
MA212 539
MA332 509
CS314 509
CS222 490
CS214 488
CS261 481
CS251 471
CS281 470
CS341 443
EL395 442
CS301 421
...
CS286 53
CS297 53
CS429 51
CS446 50
SW335 50
CS356 49
HR201 49
CS459 48
NS132 44
SO201 44
TA395 42
CJ321 41
CS397 41
CS398 40
CS348 38
CJ317 37
CJ316 36
MW313 36
MA216 35
CS407 35
CS115 34
CS457 32
CS388 31
CS449 30
CS426 30
CS408 27
CJ315 26
CS399 24
CS285 24
CS231 1
dtype: int64
In [5]:
count = 0
# NOTE(review): on a fresh top-to-bottom run, `subjects` is still the empty
# list from the previous cell (it is only populated in a later cell), so the
# sort is a no-op and the frames below get no columns -- verify cell order.
subjects.sort()
precision_rf={}
# Hard-coded drop of CS231 (the only course with a single enrollment, per
# the counts above); .copy() so per-course scores can be written in without
# mutating more20.
df_precision = more20.drop('CS231').copy() #Create to add new data with score precision
# Everything from position 4 on: academic year, semester, then the
# per-course indicator columns (see the frame display above).
list_allsub = df_file.columns[4:]
allSubject_df = pd.DataFrame(columns=[subjects],index=[list_allsub])
top10_df = pd.DataFrame(columns=[subjects])
In [6]:
# Sanity check: 110 courses remain after dropping CS231.
len(df_precision)
Out[6]:
110
In [7]:
headers = list(df_file.columns.values)
# Collect the distinct course IDs in order of first appearance.
# headers[1] is '3COURSEID' (see the frame display above).
# A set gives O(1) membership tests instead of the original O(n) list scan
# per row (quadratic over ~28k rows).
subjects = []
seen = set()
for sub in df_file[headers[1]]:
    if sub not in seen:
        seen.add(sub)
        subjects.append(sub)
countSub = len(subjects)
In [25]:
subjects.sort()
for subject in subjects:
#Create new Dataframe
#subject = 'MA211'
#print subject
df_sub = df_file[df_file['3COURSEID'] == subject]
df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
count_enrollment = df_sub['3COURSEID'].value_counts()
#print "Number of %s enrollment: %s"%(subject,count_enrollment)
A = df_sub.as_matrix()
X = A[:,6:117]
X = X.astype(np.int64, copy=False)
y = A[:,2]
y = y.astype(np.int64, copy=False)
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)
forest = RandomForestClassifier(n_estimators=10, max_depth=None,
min_samples_split=1, random_state=None, max_features=None)
clf = forest.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
print "Accuracy of %s: %s"%(subject,accuracy)
print "Precision of %s: %s"%(subject,precision)
print "Recall of %s: %s"%(subject,recall)
print "------------------------------"
# precision_rf[subject] = scores.mean()
# df_precision.loc[subject]=precision_rf[subject]
Accuracy of AT316: 0.51724137931
Precision of AT316: [ 0.58333333 0.2 ]
Recall of AT316: [ 0.77777778 0.09090909]
------------------------------
Accuracy of AT326: 0.688888888889
Precision of AT326: [ 1. 0.73529412 0.5 ]
Recall of AT326: [ 0.5 0.83333333 0.38461538]
------------------------------
Accuracy of BA291: 0.333333333333
Precision of BA291: [ 0. 0.2 0.42857143 0. ]
Recall of BA291: [ 0. 0.2 0.6 0. ]
------------------------------
Accuracy of CJ315: 0.166666666667
Precision of CJ315: [ 0. 1. 0.]
Recall of CJ315: [ 0. 0.2 0. ]
------------------------------
Accuracy of CJ316: 0.75
Precision of CJ316: [ 0. 0.71428571 1. ]
Recall of CJ316: [ 0. 1. 0.5]
------------------------------
Accuracy of CJ317: 0.75
Precision of CJ317: [ 0. 0.75 1. ]
Recall of CJ317: [ 0. 0.75 0.75]
------------------------------
Accuracy of CJ321: 0.555555555556
Precision of CJ321: [ 0. 0.5 1. ]
Recall of CJ321: [ 0. 1. 0.25]
------------------------------
Accuracy of CS101: 0.457894736842
Precision of CS101: [ 0. 0. 0.05882353 0.50292398 0. 0. ]
Recall of CS101: [ 0. 0. 0.04166667 0.88659794 0. 0. ]
------------------------------
Accuracy of CS102: 0.329896907216
Precision of CS102: [ 0. 0. 0.23529412 0.35820896 0.33333333 0. ]
Recall of CS102: [ 0. 0. 0.17021277 0.70588235 0.15686275 0. ]
------------------------------
Accuracy of CS105: 0.233333333333
Precision of CS105: [ 0. 0. 0.25 0.125 0.42857143 0. ]
Recall of CS105: [ 0. 0. 0.625 0.06666667 0.2 0. ]
------------------------------
Accuracy of CS111: 0.395209580838
Precision of CS111: [ 0.33333333 0.42105263 0.32 0.37096774 0.4375 0.81818182]
Recall of CS111: [ 0.27272727 0.25806452 0.41025641 0.42592593 0.46666667 0.52941176]
------------------------------
Accuracy of CS115: 0.571428571429
Precision of CS115: [ 0. 0.33333333 0. 1. ]
Recall of CS115: [ 0. 0.5 0. 1. ]
------------------------------
Accuracy of CS211: 0.444444444444
Precision of CS211: [ 0.5 0.375 0.53846154 0.33333333 0. ]
Recall of CS211: [ 0.5 0.375 0.58333333 0.25 0. ]
------------------------------
Accuracy of CS213: 0.509803921569
Precision of CS213: [ 0. 0.4 0.59459459 0.59677419 0.21428571 0.57142857]
Recall of CS213: [ 0. 0.85714286 0.44 0.61666667 0.1875 0.57142857]
------------------------------
Accuracy of CS214: 0.428571428571
Precision of CS214: [ 0. 0. 0.33333333 0.48 0.38888889]
Recall of CS214: [ 0. 0. 0.25 0.48 0.5 ]
------------------------------
Accuracy of CS215: 0.1875
Precision of CS215: [ 0. 0. 0.33333333 0.5 0. ]
Recall of CS215: [ 0. 0. 0.4 0.25 0. ]
------------------------------
Accuracy of CS222: 0.602040816327
Precision of CS222: [ 0. 0. 0.35294118 0.65454545 0.7 0.75 ]
Recall of CS222: [ 0. 0. 0.33333333 0.76595745 0.58333333 0.5 ]
------------------------------
Accuracy of CS223: 0.563909774436
Precision of CS223: [ 0. 0. 0.1 0.66666667 0.48148148 0.2 ]
Recall of CS223: [ 0. 0. 0.07142857 0.75949367 0.5 0.14285714]
------------------------------
C:\Anaconda\lib\site-packages\sklearn\metrics\classification.py:958: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
'precision', 'predicted', average, warn_for)
C:\Anaconda\lib\site-packages\sklearn\metrics\classification.py:960: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
'recall', 'true', average, warn_for)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-ec6a61de32eb> in <module>()
20 forest = RandomForestClassifier(n_estimators=10, max_depth=None,
21 min_samples_split=1, random_state=None, max_features=None)
---> 22 clf = forest.fit(X_train,y_train)
23 y_pred = clf.predict(X_test)
24
C:\Anaconda\lib\site-packages\sklearn\ensemble\forest.pyc in fit(self, X, y, sample_weight)
193 """
194 # Validate or convert input data
--> 195 X = check_array(X, dtype=DTYPE, accept_sparse="csc")
196 if issparse(X):
197 # Pre-sort indices to avoid that each individual tree of the
C:\Anaconda\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
358 raise ValueError("Found array with %d sample(s) (shape=%s) while a"
359 " minimum of %d is required."
--> 360 % (n_samples, shape_repr, ensure_min_samples))
361
362 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 111)) while a minimum of 1 is required.
In [14]:
# Post-mortem: inspect the last X_train built before the loop crashed.
X_train
Out[14]:
array([[ 0, 0, 0, ..., 0, 0, 80],
[ 0, 0, 0, ..., 0, 0, 1],
[ 0, 0, 0, ..., 0, 0, 36],
...,
[ 0, 0, 0, ..., 0, 0, 1],
[ 0, 0, 0, ..., 0, 0, 12],
[ 0, 0, 0, ..., 0, 0, 30]], dtype=int64)
In [ ]:
Content source: wasit7/book_pae
Similar notebooks: