Create Confusion Matrix

1. Load the dataset (CSV)
2. Split the dataset into X_train, X_test, y_train, y_test
3. Fit the classifier with .fit(X_train, y_train)
4. Predict on the test set with .predict(X_test)
5. Import confusion_matrix and build it: cm = confusion_matrix(y_test, y_pred)

A minimal sketch of these steps on synthetic data follows below; the rest of the notebook applies them to the course dataset.

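Quick orientation: the five steps above, sketched end-to-end on synthetic data. make_classification stands in for the real CSV, and every number here is illustrative only, not the course data used later. (On the older scikit-learn used in this notebook, train_test_split lives in sklearn.cross_validation instead of sklearn.model_selection.)

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split   # sklearn.cross_validation on older versions
from sklearn.metrics import confusion_matrix

X, y = make_classification(n_samples=200, n_features=10, n_informative=5,
                           n_classes=3, random_state=0)        # step 1 (synthetic stand-in for the CSV)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=0)   # step 2
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(X_train, y_train)                                       # step 3
y_pred = clf.predict(X_test)                                    # step 4
cm = confusion_matrix(y_test, y_pred)                           # step 5
print(cm)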
In [1]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score    # sklearn.model_selection in scikit-learn >= 0.18
from sklearn.cross_validation import train_test_split   # sklearn.model_selection in scikit-learn >= 0.18
from sklearn.metrics import confusion_matrix

In [2]:
df_file = pd.read_csv('../data/df_dropSub_less20_dropNaResult.csv',delimiter=",", skip_blank_lines = True, 
                 error_bad_lines=False)
df_file = df_file.drop('Unnamed: 0',axis=1)
df_file = df_file.fillna(0)
df_file = df_file.replace(['A', 'B+', 'B', 'C+', 'C' , 'D+' , 'D' , 'F' , 'W' , 'S' , 'S#' , 'U' , 'U#'], 
                     [8, 7, 7, 6 , 6, 5, 5, 4, 3, 2, 2, 1, 1])
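The replace() call above encodes letter grades as ordinals (A=8, B+/B=7, C+/C=6, D+/D=5, F=4, W=3, S/S#=2, U/U#=1). An equivalent, arguably more readable form uses a dict (grade_map is just an illustrative name):

# Same ordinal encoding as the two parallel lists above.
grade_map = {'A': 8, 'B+': 7, 'B': 7, 'C+': 6, 'C': 6, 'D+': 5, 'D': 5,
             'F': 4, 'W': 3, 'S': 2, 'S#': 2, 'U': 1, 'U#': 1}
df_file = df_file.replace(grade_map)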

In [3]:
df_file


Out[3]:
Unnamed: 0.1 3COURSEID 4RESULT 0STUDENTID 1ACADYEAR 2SEMESTER AT316 AT326 BA291 CJ315 ... TA395 TH161 TU100 TU110 TU120 TU122 TU130 TU154 PROVINCEID SCHOOLGPA
0 0 CS101 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
1 1 CS102 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
2 2 EL171 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
3 3 SC135 4 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
4 4 SC185 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
5 5 TH161 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
6 6 TU154 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
7 7 CS111 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
8 8 EL172 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
9 9 MA211 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
10 10 PY228 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
11 11 TU110 6 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
12 12 TU120 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
13 13 TU130 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
14 14 TU122 7 316644 2552 3 0 0 0 0 ... 0 6 0 6 5 0 7 5 12 3.32
15 15 AT326 8 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
16 16 CS213 6 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
17 17 CS214 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
18 18 CS222 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
19 19 CS223 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
20 20 CS284 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
21 21 MA211 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
22 22 SW111 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
23 23 AT316 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
24 24 CS251 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
25 25 CS261 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
26 26 CS281 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
27 27 MA332 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
28 28 SC135 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
29 29 ST216 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27965 31292 EL070 2 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27966 31293 MA211 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27967 31294 ST216 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27968 31295 TH161 6 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27969 31296 TU154 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27970 31297 CS101 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27971 31298 CS102 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27972 31299 CS105 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27973 31300 EL070 2 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27974 31301 MA211 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27975 31302 ST216 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27976 31303 TH161 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27977 31304 TU154 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27978 31313 CS101 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27979 31314 CS102 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27980 31315 CS105 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27981 31316 EL171 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27982 31317 MA211 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27983 31318 ST216 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27984 31319 TH161 6 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27985 31320 TU154 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27986 31325 SC185 3 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
27987 31329 CS101 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27988 31330 CS102 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27989 31331 CS105 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27990 31332 EL070 1 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27991 31333 MA211 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27992 31334 ST216 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27993 31335 TH161 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27994 31336 TU154 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08

27995 rows × 118 columns


In [4]:
count_courseId = df_file["3COURSEID"].value_counts() 
more20 = count_courseId

headers=list(df_file.columns.values)
subjects = []
countSub = 0

In [5]:
count = 0
subjects.sort()
precision_rf={}
df_precision = more20.drop('CS231').copy()

list_allsub = df_file.columns[4:]
allSubject_df = pd.DataFrame(columns=[subjects],index=[list_allsub])
top10_df = pd.DataFrame(columns=[subjects])

In [6]:
headers=list(df_file.columns.values)
subjects = []
countSub = 0
#Create dictionary of list subjects
for sub in df_file[headers[1]]:
    if sub not in subjects:
        subjects.append(sub)
        countSub = countSub+1
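The same unique-course list can be built directly with pandas; a one-line equivalent of the loop above (headers[1] is the '3COURSEID' column):

subjects = df_file['3COURSEID'].unique().tolist()
countSub = len(subjects)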

In [7]:
subjects.sort()

In [8]:
subjects.remove('CS231')

In [9]:
len(subjects)


Out[9]:
110

In [10]:
subject = 'CS401'
df_sub = df_file[df_file['3COURSEID'] == subject]
df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
count_enrollment = df_sub['3COURSEID'].value_counts()
#print "Number of %s enrollment: %s"%(subject,count_enrollment)

A = df_sub.as_matrix()
X = A[:,6:116]                     # the 110 prior-course grade columns
X = X.astype(np.int64, copy=False)
y = A[:,2]                         # '4RESULT': the grade in the target course
y = y.astype(np.int64, copy=False)
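as_matrix() was removed in pandas 1.0; on a current pandas the same X/y construction would look roughly like this, assuming the column layout shown earlier (course columns at positions 6-115, '4RESULT' at position 2):

A = df_sub.to_numpy()
X = A[:, 6:116].astype(np.int64)   # prior-course grade columns
y = A[:, 2].astype(np.int64)       # grade in the target course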

In [11]:
subject


Out[11]:
'CS401'

In [12]:
X


Out[12]:
array([[7, 8, 0, ..., 0, 6, 5],
       [7, 7, 0, ..., 0, 6, 7],
       [0, 0, 7, ..., 0, 6, 5],
       ..., 
       [0, 0, 0, ..., 0, 5, 4],
       [7, 7, 6, ..., 0, 6, 6],
       [8, 7, 0, ..., 5, 5, 3]], dtype=int64)

In [13]:
X.shape


Out[13]:
(333L, 110L)

In [14]:
y.shape


Out[14]:
(333L,)

In [15]:
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)
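Because the grade classes are imbalanced (most CS401 rows are 7s and 8s), a stratified split keeps the rarer grades represented in both sets. A possible variant (requires every class to occur at least twice):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)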

In [16]:
X_train


Out[16]:
array([[8, 8, 0, ..., 7, 6, 6],
       [0, 0, 0, ..., 0, 7, 6],
       [0, 0, 0, ..., 0, 6, 6],
       ..., 
       [0, 7, 0, ..., 5, 7, 6],
       [7, 7, 0, ..., 8, 6, 5],
       [7, 8, 0, ..., 7, 7, 5]], dtype=int64)

In [17]:
X.shape


Out[17]:
(333L, 110L)

In [18]:
X_test


Out[18]:
array([[0, 7, 0, ..., 7, 6, 5],
       [0, 0, 0, ..., 0, 6, 5],
       [0, 0, 0, ..., 0, 8, 8],
       ..., 
       [0, 0, 0, ..., 0, 6, 5],
       [7, 7, 0, ..., 4, 6, 7],
       [0, 0, 0, ..., 0, 5, 5]], dtype=int64)

In [19]:
X_test.shape


Out[19]:
(67L, 110L)

In [20]:
X_train.shape


Out[20]:
(266L, 110L)

In [21]:
y_train


Out[21]:
array([8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 7, 8, 8, 7, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 7, 8, 7, 8, 7, 7, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 6, 8, 8, 8, 7, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 8, 7, 7, 8,
       8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 7, 8, 7, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 7, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 7, 8, 6, 7, 8, 7, 8,
       8, 7, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 7,
       8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 8, 8,
       8, 8, 8, 7, 7, 8, 8, 8, 8, 8, 7, 8, 8, 7, 8, 7, 8, 8, 7, 8, 8, 8, 8,
       8, 8, 7, 8, 7, 7, 7, 6, 8, 7, 8, 8, 6, 8, 7, 8, 7, 8, 8, 8, 8, 7, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 7, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 7, 8, 7, 8, 8, 8, 7, 8], dtype=int64)

In [22]:
y_train.shape


Out[22]:
(266L,)

In [23]:
y_test


Out[23]:
array([8, 7, 8, 8, 8, 8, 8, 8, 7, 8, 7, 7, 8, 7, 8, 8, 8, 8, 6, 7, 8, 8, 7,
       8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 8, 8, 7, 7, 8, 7, 8, 8, 8,
       8, 8, 8, 7, 8, 8, 7, 7, 8, 8, 8, 8, 8, 8, 7, 8, 8, 7, 8, 8, 8], dtype=int64)

In [24]:
y_test.shape


Out[24]:
(67L,)

In [25]:
forest = RandomForestClassifier(n_estimators=10, max_depth=None,
            min_samples_split=1,   # note: newer scikit-learn rejects 1 (must be >= 2)
            random_state=None, max_features=None)
clf = forest.fit(X_train, y_train)
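cross_val_score is imported above but never used; as a quick sanity check, the same forest can be scored with 5-fold cross-validation on the full CS401 data (illustrative only):

scores = cross_val_score(forest, X, y, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))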

In [26]:
y_pred = clf.predict(X_test)

In [27]:
y_pred.shape


Out[27]:
(67L,)

In [28]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()


Confusion matrix, without normalization
[[ 0  0  1]
 [ 0  4 13]
 [ 0  3 46]]
Out[28]:
<matplotlib.figure.Figure at 0x17fb52e8>
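The raw counts can be hard to read on their own; per-class precision and recall summarize the same predictions. A quick check using scikit-learn's classification_report:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))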

In [29]:
y_pred


Out[29]:
array([8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 8, 8, 8,
       8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], dtype=int64)

In [30]:
y_test


Out[30]:
array([8, 7, 8, 8, 8, 8, 8, 8, 7, 8, 7, 7, 8, 7, 8, 8, 8, 8, 6, 7, 8, 8, 7,
       8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 8, 8, 7, 7, 8, 7, 8, 8, 8,
       8, 8, 8, 7, 8, 8, 7, 7, 8, 8, 8, 8, 8, 8, 7, 8, 8, 7, 8, 8, 8], dtype=int64)

In [31]:
Grade = ['A', 'B', 'C', 'D', 'F', 'W', 'S', 'U', 'na']
row = []
for cls in y_train:            # collect the numeric grade classes present in y_train
    if cls not in row:
        row.append(cls)
row.sort()
print(row)

row_cm = []                    # letter labels for the confusion-matrix axes
for i in range(len(row)):
    grade = Grade[::-1][row[i]]    # reversed list maps code 8 -> 'A', 7 -> 'B', 6 -> 'C', ...
    print(grade)
    row_cm.append(grade)
print(row_cm)


[6, 7, 8]
C
B
A
['C', 'B', 'A']

In [32]:
row_cm


Out[32]:
['C', 'B', 'A']

In [55]:
def plot_confusion_matrix(cm, title=subject, cmap=plt.cm.Blues):
    # Draw the matrix as an image and write each cell's value on top of it.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    for y in range(cm.shape[0]):
        for x in range(cm.shape[1]):
            plt.text(x, y, '%.1f' % cm[y, x],
                     horizontalalignment='center', verticalalignment='center')

    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(row_cm))
    plt.xticks(tick_marks, row_cm, rotation=45)
    plt.yticks(tick_marks, row_cm)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [56]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix')
print(row_cm)
print(cm)
plt.figure()
plot_confusion_matrix(cm)


Confusion matrix
['C', 'B', 'A']
[[ 0  0  1]
 [ 0  4 13]
 [ 0  3 46]]

In [57]:
cm


Out[57]:
array([[ 0,  0,  1],
       [ 0,  4, 13],
       [ 0,  3, 46]])

In [58]:
plt.show()
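The counts above are unnormalized; dividing each row by its sum turns the diagonal into per-class recall, which is easier to compare across grades. A sketch, reusing the plotting helper defined above:

cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]   # each row sums to 1
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='%s (normalized)' % subject)
plt.show()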

Confusion Matrix for All Subjects

(some subjects have enrollments but no grade result yet)


In [46]:
for subject in subjects:
    #Create new Dataframe
    
    #print subject             
    df_sub = df_file[df_file['3COURSEID'] == subject]
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()
    #print "Number of %s enrollment: %s"%(subject,count_enrollment)

    A = df_sub.as_matrix()
    X = A[:,6:117]             # note: 6:117 also pulls in PROVINCEID; the single-subject cell above used 6:116
    X = X.astype(np.int64, copy=False)
    y = A[:,2]
    y = y.astype(np.int64, copy=False)
    
    # Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)
    
    forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
            min_samples_split=1, random_state=None, max_features=None)
    clf = forest.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion matrix of %s'%subject)
    print(cm)
    plt.figure()
    
    #plot_confusion_matrix(cm)
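    # Optional sketch: record a per-subject score in the precision_rf dict created
    # earlier (overall test accuracy here, as a simple single-number summary).
    precision_rf[subject] = clf.score(X_test, y_test)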


Confusion matrix of AT316
[[ 0  1  0]
 [ 0 14  1]
 [ 0 10  3]]
Confusion matrix of AT326
[[ 0  1  0]
 [ 0 16  8]
 [ 0 13  7]]
Confusion matrix of BA291
[[4 3 0]
 [3 1 0]
 [0 1 0]]
Confusion matrix of CJ315
[[0 0 0]
 [0 4 0]
 [1 1 0]]
Confusion matrix of CJ316
[[0 1 0]
 [0 4 2]
 [0 1 0]]
Confusion matrix of CJ317
[[4 2]
 [1 1]]
Confusion matrix of CJ321
[[7 0]
 [2 0]]
Confusion matrix of CS101
[[ 0  0  0  7  0  0]
 [ 0  0  1  5  1  0]
 [ 0  2  2 15  1  0]
 [ 0  0 11 85  6  0]
 [ 0  2  5 35  6  0]
 [ 0  1  0  5  0  0]]
Confusion matrix of CS102
[[ 0  0  0  2  1  0]
 [ 0  0  0  4  4  0]
 [ 0  0  4 32  5  0]
 [ 0  0 10 64 19  1]
 [ 0  0  4 21  9  0]
 [ 0  1  1  9  3  0]]
Confusion matrix of CS105
[[1 0 3 2 0 0]
 [1 0 0 1 0 0]
 [3 2 9 3 2 0]
 [1 1 7 4 1 0]
 [2 0 7 4 1 1]
 [0 0 2 0 2 0]]
Confusion matrix of CS111
[[ 2  4  4  2  0  0]
 [ 2  7  5  5  0  0]
 [ 1 13 17 18  1  0]
 [ 3  6 10 23  7  0]
 [ 0  1  2 10  5  4]
 [ 0  0  1  6  3  5]]
Confusion matrix of CS115
[[1 1 0]
 [1 0 0]
 [0 1 3]]
Confusion matrix of CS211
[[0 0 0 0 0]
 [1 1 5 1 0]
 [1 4 8 1 0]
 [0 0 1 2 0]
 [0 0 0 2 0]]
Confusion matrix of CS213
[[ 1  1  0  2  0  0]
 [ 3 14  7  1  1  0]
 [ 1  6 19 12  1  0]
 [ 0  1 13 38  6  0]
 [ 0  0  1  7  5  2]
 [ 0  0  0  4  5  2]]
Confusion matrix of CS214
[[ 0  0  3  0]
 [ 0  1 11  0]
 [ 0  9 29  8]
 [ 1  5 14 17]]
Confusion matrix of CS215
[[1 0 1 0 0]
 [0 0 0 0 0]
 [1 0 2 1 0]
 [1 1 0 0 1]
 [2 0 2 0 3]]
Confusion matrix of CS222
[[ 0  0  1  2  0  0]
 [ 1  1  1  1  0  0]
 [ 2  0  4  8  1  0]
 [ 0  1  8 32  8  1]
 [ 0  0  2  8  8  4]
 [ 0  0  0  1  1  2]]
Confusion matrix of CS223
[[ 0  0  1  2  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0  4 11  1  0]
 [ 2  0 17 53  9  0]
 [ 1  0  0 15 11  2]
 [ 0  0  0  0  3  0]]
Confusion matrix of CS251
[[ 1  0  0  0  0  0]
 [ 0  0  1  1  0  0]
 [ 1  0  3 10  0  0]
 [ 0  0  7 35  5  0]
 [ 0  0  0 11 14  1]
 [ 0  0  1  0  2  2]]
Confusion matrix of CS261
[[ 0  0  2  0  0  0]
 [ 0  0  1  1  1  0]
 [ 0  0  6  1  0  0]
 [ 0  0 10 23  7  0]
 [ 0  0  1 11 21  3]
 [ 0  0  0  1  6  2]]
Confusion matrix of CS281
[[ 0  0  1  1  0  0]
 [ 0  0  1  0  0  0]
 [ 0  0  7  7  0  0]
 [ 0  0  4 43  4  0]
 [ 0  0  0 17  6  0]
 [ 0  0  0  2  1  0]]
Confusion matrix of CS284
[[ 0  0  1  3  0  0]
 [ 0  0  2  1  0  0]
 [ 0  0 11  8  0  0]
 [ 0  1 13 37  7  0]
 [ 0  0  6 17  6  2]
 [ 0  0  0  3  3  3]]
Confusion matrix of CS285
[[1 1 0]
 [0 0 0]
 [1 0 2]]
Confusion matrix of CS286
[[1 1 0]
 [2 4 1]
 [0 2 0]]
Confusion matrix of CS288
[[ 0  4  1  0]
 [ 1 10  1  1]
 [ 0  4  2  0]
 [ 1  1  1  1]]
Confusion matrix of CS289
[[ 0  2  0  0]
 [ 0  5  9  0]
 [ 0  2 12  2]
 [ 0  0  3  2]]
Confusion matrix of CS295
[[1 0 0 0 0 0]
 [0 0 2 0 0 0]
 [1 0 2 1 1 0]
 [0 0 3 1 0 0]
 [0 0 1 0 0 0]
 [1 0 0 0 1 0]]
Confusion matrix of CS296
[[0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 1]
 [0 0 0 3 3 0]
 [0 0 0 2 5 1]
 [0 0 0 3 1 1]]
Confusion matrix of CS297
[[2 0 0]
 [1 4 1]
 [0 2 1]]
Confusion matrix of CS300
[[3 5]
 [2 3]]
Confusion matrix of CS301
[[ 0  1  1  0]
 [ 1  3 12  1]
 [ 0  6 34  5]
 [ 0  1 14  6]]
Confusion matrix of CS302
[[ 0  0  2  0  0]
 [ 0  1  1  0  0]
 [ 0  2 25  3  0]
 [ 0  0 13 15  1]
 [ 0  0  0  2  3]]
Confusion matrix of CS311
[[ 0  0  1  2  0  0]
 [ 0  0  1  2  0  0]
 [ 0  0  3 11  1  0]
 [ 1  0  7 18  3  1]
 [ 0  0  0  5  4  1]
 [ 0  0  0  1  1  6]]
Confusion matrix of CS314
[[ 2  0  2  5  1  2]
 [ 0  3  1  1  0  0]
 [ 6  0  7 12  1  0]
 [ 4  1 16 12  6  0]
 [ 2  1  0  4  7  0]
 [ 0  0  1  0  4  1]]
Confusion matrix of CS326
[[ 0  0  1  0]
 [ 0  4  3  1]
 [ 0  0 11  1]
 [ 0  1  6  1]]
Confusion matrix of CS341
[[ 0  0  1  0  0]
 [ 0  1  5  0  0]
 [ 0  4 51  3  0]
 [ 0  1  6  9  0]
 [ 0  0  2  3  3]]
Confusion matrix of CS342
[[ 1  6  0  0]
 [ 3 27  7  0]
 [ 0  7 12  0]
 [ 0  1  3  0]]
Confusion matrix of CS348
[[0 0 0 0 0]
 [1 0 0 0 0]
 [0 0 5 0 0]
 [0 0 1 0 0]
 [0 0 1 0 0]]
Confusion matrix of CS356
[[0 0 1]
 [0 2 1]
 [0 3 3]]
Confusion matrix of CS365
[[ 1  0  0  0  0]
 [ 1  0  1  1  0]
 [ 0  2 13  8  0]
 [ 0  1  1 12  0]
 [ 0  0  0  5  2]]
Confusion matrix of CS366
[[0 0 1 0]
 [1 3 2 0]
 [0 1 5 0]
 [0 0 2 0]]
Confusion matrix of CS367
[[0 1 0 0]
 [0 3 1 1]
 [0 6 5 0]
 [0 0 2 1]]
Confusion matrix of CS374
[[ 0  0  1  0  0]
 [ 0  1  6  0  0]
 [ 0  3 14 18  0]
 [ 0  0  7  8  2]
 [ 0  0  0  6  2]]
Confusion matrix of CS377
[[0 1 0 0]
 [0 1 2 0]
 [0 0 4 1]
 [0 0 4 1]]
Confusion matrix of CS385
[[ 0  1  0  0]
 [ 0 10  6  0]
 [ 0  3 13  2]
 [ 0  0  2  0]]
Confusion matrix of CS386
[[0 0 1 0]
 [1 0 2 1]
 [0 1 6 2]
 [0 1 0 1]]
Confusion matrix of CS387
[[0 0 0 0]
 [1 1 3 1]
 [0 0 3 1]
 [0 0 0 1]]
Confusion matrix of CS388
[[0 0 0]
 [0 4 0]
 [1 0 2]]
Confusion matrix of CS395
[[ 0  0  0  2  0  0]
 [ 1  0  2  0  0  0]
 [ 0  0 11  5  0  0]
 [ 0  1  7 11  1  0]
 [ 0  0  0  4  3  1]
 [ 0  0  0  0  3  0]]
Confusion matrix of CS396
[[0 0 0 1]
 [0 1 2 0]
 [0 0 4 0]
 [0 0 1 3]]
Confusion matrix of CS397
[[2 1 0]
 [2 2 0]
 [0 1 1]]
Confusion matrix of CS398
[[0 0 0 0]
 [0 1 1 0]
 [0 1 0 0]
 [1 3 1 0]]
Confusion matrix of CS399
[[0 2]
 [1 2]]
Confusion matrix of CS401
[[ 0  1  1]
 [ 0  1 12]
 [ 0  5 47]]
Confusion matrix of CS402
[[ 1  0  0  0  0]
 [ 1  0  1  1  0]
 [ 0  0  3  2  2]
 [ 0  0  0  3  4]
 [ 0  1  2  2 27]]
Confusion matrix of CS407
[[3 0]
 [2 2]]
Confusion matrix of CS408
[[4 2]
 [0 0]]
Confusion matrix of CS409
[[0 1 0 1]
 [0 0 0 1]
 [0 1 3 5]
 [0 2 2 1]]
Confusion matrix of CS426
[[0 1 0]
 [0 1 0]
 [0 3 1]]
Confusion matrix of CS427
[[0 1 0 0]
 [0 2 1 1]
 [0 1 2 1]
 [0 1 2 0]]
Confusion matrix of CS429
[[0 0 0]
 [2 5 0]
 [0 3 1]]
Confusion matrix of CS446
[[0 0 0]
 [3 3 0]
 [0 2 2]]
Confusion matrix of CS449
[[6]]
Confusion matrix of CS456
[[1 1 0 0]
 [0 0 1 0]
 [0 4 2 0]
 [0 0 2 3]]
Confusion matrix of CS457
[[0 1 0]
 [0 0 2]
 [0 1 3]]
Confusion matrix of CS459
[[0 2 0]
 [1 5 0]
 [0 0 2]]
Confusion matrix of CS467
[[0 0 1 0]
 [0 2 2 0]
 [0 1 4 1]
 [0 1 2 0]]
Confusion matrix of CS486
[[0 1 0 0]
 [0 5 4 1]
 [0 2 1 0]
 [0 0 3 0]]
Confusion matrix of CS487
[[ 0  0  1  0]
 [ 0  0  1  1]
 [ 0  0  4  4]
 [ 0  0  3 14]]
Confusion matrix of CS488
[[13  1]
 [ 5  3]]
Confusion matrix of CS489
[[0 0 0 2 0]
 [0 0 1 1 0]
 [0 1 7 3 0]
 [0 0 1 9 0]
 [0 0 0 1 0]]
Confusion matrix of EL070
[[ 0  4  0]
 [ 0 41  0]
 [ 0  1  0]]
Confusion matrix of EL171
[[ 0  0  1  5  2  0]
 [ 0  0  1  3  0  0]
 [ 0  0 11 32  5  1]
 [ 4  0  8 26  7  0]
 [ 1  1  8 31  1  0]
 [ 1  0  0  3  0  0]]
Confusion matrix of EL172
[[ 0  1  1  2  1  0]
 [ 0  0  0  2  0  0]
 [ 0  1 13 15  0  0]
 [ 0  0 17 39 10  0]
 [ 2  0  1 12 17  1]
 [ 0  1  0  1  3  0]]
Confusion matrix of EL295
[[ 0  0  0  2  2  0]
 [ 0  0  0  1  0  0]
 [ 0  0  6  7  0  0]
 [ 0  0  4 36  6  1]
 [ 0  0  1 14 19  4]
 [ 0  0  0  1 10  4]]
Confusion matrix of EL395
[[ 6  6  2  1]
 [ 6 14  3  3]
 [ 1 13 15  7]
 [ 0  1  5  6]]
Confusion matrix of ES356
[[0 0 1 0]
 [0 1 3 0]
 [0 0 5 0]
 [0 0 2 0]]
Confusion matrix of HO201
[[ 0  0  0  0  1  0]
 [ 1  0  0  0  2  0]
 [ 0  0  0  0  1  0]
 [ 0  0  0  2  1  0]
 [ 0  0  0  3 25  7]
 [ 0  0  0  1  6  2]]
Confusion matrix of HR201
[[6 1]
 [2 1]]
Confusion matrix of LA209
[[3 0 1 0]
 [0 1 0 0]
 [0 3 3 0]
 [0 2 1 0]]
Confusion matrix of MA211
[[56  4  8  5  0  0]
 [12  0  6  1  1  0]
 [18  5 13  9  2  0]
 [22  1 14 14  5  0]
 [ 1  0  1  4  3  2]
 [ 1  0  1  2  2  1]]
Confusion matrix of MA212
[[ 9  0  4  4  1  0]
 [ 0  0  0  2  0  0]
 [ 6  2  9  5  0  0]
 [ 6  0 14 14  2  3]
 [ 2  0  1  6  3  5]
 [ 2  0  0  4  0  4]]
Confusion matrix of MA216
[[4 1 0]
 [0 0 0]
 [1 0 1]]
Confusion matrix of MA332
[[ 7  0  6  2  0  0]
 [ 2  0  2  0  0  0]
 [ 4  2 12  7  0  0]
 [ 5  0 11 19  4  0]
 [ 1  0  3  9  3  0]
 [ 0  0  0  2  0  1]]
Confusion matrix of MW313
[[5 0]
 [3 0]]
Confusion matrix of MW314
[[7 2]
 [3 0]]
Confusion matrix of NS132
[[0 0 1 0]
 [0 0 0 0]
 [0 0 5 0]
 [0 1 2 0]]
Confusion matrix of PY228
[[ 0  0  0  1  1  0]
 [ 0  1  1  1  0  0]
 [ 0  0  0 10  4  2]
 [ 4  1  4 54  9  3]
 [ 1  0  2 15  9  2]
 [ 1  0  1  9  1  4]]
Confusion matrix of SC123
[[1 1 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 5 2 0 0]
 [1 2 7 2 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]]
Confusion matrix of SC135
[[ 2  3  9 10  1  1]
 [ 0  0  4  6  1  0]
 [ 2  6 14 17  0  0]
 [ 1  1 15 23  7  1]
 [ 1  0  2  8  3  1]
 [ 0  0  4  4  0  1]]
Confusion matrix of SC173
[[ 1  0  1  2  0]
 [ 0  0  1  0  0]
 [ 0  0 13  0  0]
 [ 0  0  6  0  0]
 [ 0  0  1  0  0]]
Confusion matrix of SC185
[[ 1  0  1  3  0  0]
 [ 0  0  0  1  0  0]
 [ 1  1  1  6  3  0]
 [ 1  3  5 57 18  0]
 [ 1  0  0 25  4  1]
 [ 0  0  0  3  2  0]]
Confusion matrix of SO201
[[0 0 1 0]
 [0 0 2 0]
 [0 1 4 0]
 [0 0 0 1]]
Confusion matrix of ST216
[[ 8  1  3  4  0  0]
 [ 0  1  0  1  1  0]
 [ 3  4 12 16  0  0]
 [ 2  4  9 16  7  0]
 [ 1  1  2 16  4  4]
 [ 1  0  2  1  1  0]]
Confusion matrix of SW111
[[0 0 0 0]
 [0 4 4 0]
 [1 3 6 0]
 [0 0 3 0]]
Confusion matrix of SW212
[[ 0  0  1  0]
 [ 0  0  2  0]
 [ 0  1 12  1]
 [ 0  0  0  0]]
Confusion matrix of SW213
[[0 0 0]
 [1 7 1]
 [0 4 0]]
Confusion matrix of SW221
[[9 4]
 [4 0]]
Confusion matrix of SW335
[[0 0 1 0]
 [0 0 0 1]
 [0 0 3 2]
 [0 0 1 2]]
Confusion matrix of SW365
[[0 0 1 0]
 [0 0 3 1]
 [0 0 9 2]
 [0 0 1 2]]
Confusion matrix of SW475
[[7 0]
 [5 0]]
Confusion matrix of SW478
[[0 1 0]
 [0 8 2]
 [0 2 0]]
Confusion matrix of TA395
[[0 0 0 1]
 [0 0 1 0]
 [0 0 2 1]
 [0 0 1 3]]
Confusion matrix of TH161
[[ 0  0  0  1  3  0]
 [ 0  0  0  1  3  0]
 [ 0  0  0  1 14  0]
 [ 0  1  0  9 44  1]
 [ 0  2  0 15 70  3]
 [ 0  1  0  4  7  0]]
Confusion matrix of TU100
[[ 0  1  0  4  0]
 [ 0  0  0  2  0]
 [ 1  0  0  1  0]
 [ 2  0  0 17  4]
 [ 0  0  0  6  4]]
Confusion matrix of TU110
[[ 0  0  5  1  0]
 [ 1  0  2  3  0]
 [ 0  2 35 15  1]
 [ 0  0 24 21  6]
 [ 1  0  6  5  6]]
Confusion matrix of TU120
[[ 0  0  1  1  2  0]
 [ 0  0  0  0  2  0]
 [ 0  0  6  7  3  0]
 [ 0  0  3 13 10  2]
 [ 1  2  7  8 50  2]
 [ 1  1  1  2  5  0]]
Confusion matrix of TU122
[[0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [1 0 3 1 1 0]
 [2 0 0 7 5 1]
 [0 0 0 3 1 0]
 [0 0 0 0 2 0]]
Confusion matrix of TU130
[[ 0  0  0  1  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  3  4  0  0]
 [ 0  0  8 58  7  0]
 [ 0  0  0 18 10  0]
 [ 0  0  0  1  2  0]]
Confusion matrix of TU154
[[ 1  1  6 18  0  0]
 [ 2  0  7  9  0  0]
 [ 3  2  7 44  0  0]
 [ 6  2 12 43  1  0]
 [ 0  0  3  9  0  0]
 [ 0  0  1 13  0  0]]

In [1]:
from sklearn.metrics import mean_squared_error

In [234]:
import pandas as pd
import numpy as np
from collections import defaultdict

df_file = pd.read_csv('../data/CS_table_No2_No4_new.csv',delimiter=";", skip_blank_lines = True, 
                 error_bad_lines=False)

In [235]:
for subject in subjects:
                 
    df_sub = df_file[df_file['3COURSEID'] == subject]   # assumes the wide per-student frame; the raw table read above has 'COURSEID' instead
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()
    #print "Number of %s enrollment: %s"%(subject,count_enrollment)

    A = df_sub.as_matrix()
    X = A[:,6:116]
    X = X.astype(np.int64, copy=False)
    y = A[:,2]
    y = y.astype(np.int64, copy=False)

    #Training data
    forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
            min_samples_split=1, random_state=None, max_features=None)
    clf = forest.fit(X, y)
    
    f = "tree_drop/tree%s.pic"%subject
    with open(f, 'wb') as pickleFile:
        pickle.dump(clf, pickleFile, pickle.HIGHEST_PROTOCOL)
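Each fitted forest is pickled to tree_drop/tree<subject>.pic. Loading one back later for prediction would look like this (a sketch using the CS401 model; the feature matrix passed to predict must have the same 110 columns used for training):

with open('tree_drop/treeCS401.pic', 'rb') as pickleFile:
    clf_loaded = pickle.load(pickleFile)
y_new_pred = clf_loaded.predict(X)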


Out[235]:
STUDENTID ACADYEAR SEMESTER CAMPUSID COURSEID CAMPUSNAME CURRIC COURSENAME SECTIONGROUP CREDIT GRADE
0 316644 2552 1 2 CS101 RANGSIT 521 DISCRETE STRUCTURES 10001 3 C
1 316644 2552 1 2 CS102 RANGSIT 521 COMPUTER PROGRAMMING FUNDAMENTALS 20301 4 C
2 316644 2552 1 2 EL171 RANGSIT 521 ENGLISH COURSE 2 100003 3 D
3 316644 2552 1 2 SC135 RANGSIT 521 GENERAL PHYSICS 80001 3 F
4 316644 2552 1 2 SC185 RANGSIT 521 GENERAL PHYSICS LABORATORY 5401 1 C
5 316644 2552 1 2 TH161 RANGSIT 521 THAI USAGE 90004 3 C
6 316644 2552 1 2 TU154 RANGSIT 521 FOUNDATION OF MATHEMATICS 70001 3 D
7 316644 2552 2 2 CS111 RANGSIT 521 OBJECT-ORIENTED PROGRAMMING 40501 4 D+
8 316644 2552 2 2 EL172 RANGSIT 521 ENGLISH COURSE 3 100003 3 F
9 316644 2552 2 2 MA211 RANGSIT 521 CALCULUS 1 90001 3 F
10 316644 2552 2 2 PY228 RANGSIT 521 PSYCHOLOGY OF INTERPERSONAL RELATIONS 10001 3 B
11 316644 2552 2 2 TU110 RANGSIT 521 INTEGRATED HUMANITIES 320001 3 C+
12 316644 2552 2 2 TU120 RANGSIT 521 INTEGRATED SOCIAL SCIENCES 870001 3 D+
13 316644 2552 2 2 TU130 RANGSIT 521 INTEGRATED SCIENCES AND TECHNOLOGY 230001 3 B
14 316644 2552 3 1 TU122 THA PRACHAN 521 LAW IN EVERYDAY LIFE 824501 3 B+
15 316644 2553 1 2 AT326 RANGSIT 521 RICE AND PRODUCTION TECHNOLOGY 10001 3 A
16 316644 2553 1 2 CS213 RANGSIT 521 DATA STRUCTURES 50002 3 C
17 316644 2553 1 2 CS214 RANGSIT 521 SOCIAL AND PROFESSIONAL ETHICS 1 1 B
18 316644 2553 1 2 CS222 RANGSIT 521 PROGRAMMING LANGUAGES AND PARADIGMS 910001 3 B
19 316644 2553 1 2 CS223 RANGSIT 521 COMPUTER ORGANIZATION AND ARCHITECTURE 40002 3 B
20 316644 2553 1 2 CS284 RANGSIT 521 INTRODUCTION TO SOFTWARE ENGINEERING 900001 3 B
21 316644 2553 1 2 MA211 RANGSIT 521 CALCULUS 1 820001 3 D
22 316644 2553 1 2 SW111 RANGSIT 521 PHILOSOPHICAL FOUNDATIONS AND PROFESSIONAL ETHICS 810001 3 D+
23 316644 2553 2 2 AT316 RANGSIT 521 INDUSTRIAL AND POWER PLANTS 60001 3 B+
24 316644 2553 2 2 CS251 RANGSIT 521 DATABASE SYSTEMS 1 20002 3 C+
25 316644 2553 2 2 CS261 RANGSIT 521 HUMAN INFORMATION PROCESSING 90001 3 B
26 316644 2553 2 2 CS281 RANGSIT 521 OBJECT-ORIENTED ANALYSIS AND DESIGN 780001 3 B+
27 316644 2553 2 2 MA332 RANGSIT 521 LINEAR ALGEBRA 30001 3 C
28 316644 2553 2 2 SC135 RANGSIT 521 GENERAL PHYSICS 830001 3 C
29 316644 2553 2 2 ST216 RANGSIT 521 STATISTICS FOR SOCIAL SCIENCE 1 40001 3 C
... ... ... ... ... ... ... ... ... ... ... ...
31314 447242 2557 1 2 CS102 RANGSIT 561 COMPUTER PROGRAMMING FUNDAMENTALS 650001 3 D+
31315 447242 2557 1 2 CS105 RANGSIT 561 PRACTICUM FOR STRUCTURAL PROGRAMMING 650001 1 D+
31316 447242 2557 1 2 EL171 RANGSIT 561 ENGLISH COURSE 2 650001 0 W
31317 447242 2557 1 2 MA211 RANGSIT 561 CALCULUS 1 650001 0 W
31318 447242 2557 1 2 ST216 RANGSIT 561 STATISTICS FOR SOCIAL SCIENCE 1 650001 0 W
31319 447242 2557 1 2 TH161 RANGSIT 561 THAI USAGE 650001 3 C
31320 447242 2557 1 2 TU154 RANGSIT 561 FOUNDATION OF MATHEMATICS 650001 3 D
31321 447242 2557 2 2 CS111 RANGSIT 561 OBJECT-ORIENTED PROGRAMMING 650002 3 NaN
31322 447242 2557 2 2 EL171 RANGSIT 561 ENGLISH COURSE 2 650001 3 NaN
31323 447242 2557 2 2 PY228 RANGSIT 561 PSYCHOLOGY OF INTERPERSONAL RELATIONS 650001 3 NaN
31324 447242 2557 2 2 SC135 RANGSIT 561 GENERAL PHYSICS 650001 3 NaN
31325 447242 2557 2 2 SC185 RANGSIT 561 GENERAL PHYSICS LABORATORY 650001 0 W
31326 447242 2557 2 2 SW365 RANGSIT 561 FAMILY LIFE EDUCATION SEX EDUCATION AND SEXUAL... 650001 3 NaN
31327 447242 2557 2 2 TU100 RANGSIT 561 CIVIC EDUCATION 650001 3 NaN
31328 447242 2557 2 2 TU120 RANGSIT 561 INTEGRATED SOCIAL SCIENCES 650001 2 NaN
31329 447243 2557 1 2 CS101 RANGSIT 561 DISCRETE STRUCTURES 650002 3 F
31330 447243 2557 1 2 CS102 RANGSIT 561 COMPUTER PROGRAMMING FUNDAMENTALS 650002 3 F
31331 447243 2557 1 2 CS105 RANGSIT 561 PRACTICUM FOR STRUCTURAL PROGRAMMING 650002 1 F
31332 447243 2557 1 2 EL070 RANGSIT 561 ENGLISH COURSE 1 650001 3 U#
31333 447243 2557 1 2 MA211 RANGSIT 561 CALCULUS 1 650001 3 F
31334 447243 2557 1 2 ST216 RANGSIT 561 STATISTICS FOR SOCIAL SCIENCE 1 650001 3 F
31335 447243 2557 1 2 TH161 RANGSIT 561 THAI USAGE 650002 3 F
31336 447243 2557 1 2 TU154 RANGSIT 561 FOUNDATION OF MATHEMATICS 650001 3 F
31337 447243 2557 2 2 CS111 RANGSIT 561 OBJECT-ORIENTED PROGRAMMING 650002 3 NaN
31338 447243 2557 2 2 CS115 RANGSIT 561 PRACTICUM FOR OBJECT-ORIENTED PROGRAMMING 650001 1 NaN
31339 447243 2557 2 2 PY228 RANGSIT 561 PSYCHOLOGY OF INTERPERSONAL RELATIONS 650001 3 NaN
31340 447243 2557 2 2 SC135 RANGSIT 561 GENERAL PHYSICS 650001 3 NaN
31341 447243 2557 2 2 SC185 RANGSIT 561 GENERAL PHYSICS LABORATORY 650001 1 NaN
31342 447243 2557 2 2 TU100 RANGSIT 561 CIVIC EDUCATION 650001 3 NaN
31343 447243 2557 2 2 TU120 RANGSIT 561 INTEGRATED SOCIAL SCIENCES 650001 2 NaN

31344 rows × 11 columns


In [ ]: