In [1]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
df_file = pd.read_csv('../data/df_dropSub_less20.csv',delimiter=",", skip_blank_lines = True, 
                 error_bad_lines=False)
df_file = df_file.fillna(0)
df_file = df_file.replace(['A', 'B+', 'B', 'C+', 'C' , 'D+' , 'D' , 'F' , 'W' , 'S' , 'S#' , 'U' , 'U#'], 
                     [8, 7, 7, 6 , 6, 5, 5, 4, 3, 2, 2, 1, 1])

In [3]:
df_file


Out[3]:
Unnamed: 0 3COURSEID 4RESULT 0STUDENTID 1ACADYEAR 2SEMESTER AT316 AT326 BA291 CJ315 ... TA395 TH161 TU100 TU110 TU120 TU122 TU130 TU154 PROVINCEID SCHOOLGPA
0 0 CS101 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
1 1 CS102 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
2 2 EL171 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
3 3 SC135 4 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
4 4 SC185 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
5 5 TH161 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
6 6 TU154 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
7 7 CS111 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
8 8 EL172 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
9 9 MA211 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
10 10 PY228 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
11 11 TU110 6 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
12 12 TU120 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
13 13 TU130 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
14 14 TU122 7 316644 2552 3 0 0 0 0 ... 0 6 0 6 5 0 7 5 12 3.32
15 15 AT326 8 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
16 16 CS213 6 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
17 17 CS214 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
18 18 CS222 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
19 19 CS223 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
20 20 CS284 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
21 21 MA211 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
22 22 SW111 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
23 23 AT316 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
24 24 CS251 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
25 25 CS261 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
26 26 CS281 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
27 27 MA332 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
28 28 SC135 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
29 29 ST216 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
30970 31314 CS102 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30971 31315 CS105 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30972 31316 EL171 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30973 31317 MA211 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30974 31318 ST216 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30975 31319 TH161 6 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30976 31320 TU154 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
30977 31321 CS111 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30978 31322 EL171 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30979 31323 PY228 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30980 31324 SC135 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30981 31325 SC185 3 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30982 31326 SW365 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30983 31327 TU100 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30984 31328 TU120 0 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
30985 31329 CS101 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30986 31330 CS102 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30987 31331 CS105 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30988 31332 EL070 1 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30989 31333 MA211 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30990 31334 ST216 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30991 31335 TH161 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30992 31336 TU154 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
30993 31337 CS111 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30994 31338 CS115 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30995 31339 PY228 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30996 31340 SC135 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30997 31341 SC185 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30998 31342 TU100 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08
30999 31343 TU120 0 447243 2557 2 0 0 0 0 ... 0 4 0 0 0 0 0 4 84 2.08

31000 rows × 119 columns


In [4]:
headers=list(df_file.columns.values)
subjects = []
countSub = 0
#Create dictionary of list subjects
for sub in df_file[headers[1]]:
    if sub not in subjects:
        subjects.append(sub)
        countSub = countSub+1

In [5]:
subjects.sort()

In [6]:
len(subjects)


Out[6]:
111

In [7]:
for subject in subjects:
    #Create new Dataframe
    
    #print subject             
    df_sub = df_file[df_file['3COURSEID'] == subject]
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()
    #print "Number of %s enrollment: %s"%(subject,count_enrollment)

    A = df_sub.as_matrix()
    X = A[:,6:117]
    X = X.astype(np.int64, copy=False)
    y = A[:,2]
    y = y.astype(np.int64, copy=False)

In [8]:
subject = 'CS105'
df_sub = df_file[df_file['3COURSEID'] == subject]
df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
count_enrollment = df_sub['3COURSEID'].value_counts()
#print "Number of %s enrollment: %s"%(subject,count_enrollment)

A = df_sub.as_matrix()
X = A[:,6:117]
X = X.astype(np.int64, copy=False)
y = A[:,2]
y = y.astype(np.int64, copy=False)

In [9]:
subject


Out[9]:
'CS105'

In [10]:
X


Out[10]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
X.shape


Out[11]:
(298L, 111L)

In [12]:
y.shape


Out[12]:
(298L,)

In [13]:
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)

In [14]:
X_train


Out[14]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X.shape


Out[15]:
(298L, 111L)

In [16]:
X_test


Out[16]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
X_test.shape


Out[17]:
(60L, 111L)

In [18]:
y_train


Out[18]:
array([5, 6, 7, 6, 3, 6, 6, 5, 7, 3, 5, 6, 7, 8, 3, 6, 5, 7, 8, 4, 6, 3, 6,
       3, 6, 5, 6, 7, 8, 7, 6, 3, 6, 6, 6, 7, 6, 7, 5, 7, 5, 7, 7, 3, 7, 5,
       7, 5, 7, 5, 7, 5, 7, 3, 6, 6, 6, 4, 8, 7, 8, 5, 5, 6, 7, 3, 3, 5, 7,
       5, 6, 5, 3, 5, 7, 6, 5, 6, 5, 5, 7, 7, 5, 6, 4, 3, 7, 8, 4, 6, 7, 5,
       6, 5, 5, 3, 5, 6, 5, 5, 5, 5, 5, 3, 3, 4, 7, 5, 8, 7, 5, 6, 5, 5, 6,
       8, 5, 5, 7, 7, 6, 7, 4, 7, 6, 5, 7, 6, 6, 6, 7, 6, 5, 5, 6, 7, 7, 6,
       6, 7, 5, 5, 5, 6, 7, 5, 7, 6, 5, 4, 8, 6, 5, 6, 7, 3, 7, 7, 5, 4, 6,
       3, 3, 4, 7, 3, 5, 7, 5, 5, 7, 3, 5, 5, 5, 6, 7, 6, 3, 5, 5, 5, 3, 5,
       5, 6, 3, 6, 6, 7, 5, 3, 6, 6, 3, 7, 7, 3, 8, 5, 5, 6, 5, 6, 5, 6, 3,
       5, 6, 8, 5, 7, 6, 6, 7, 5, 6, 7, 8, 6, 8, 6, 8, 8, 5, 8, 3, 5, 5, 7,
       6, 5, 5, 4, 7, 3, 5, 8], dtype=int64)

In [19]:
y_train.shape


Out[19]:
(238L,)

In [20]:
y_test


Out[20]:
array([6, 8, 8, 5, 6, 7, 5, 4, 5, 3, 7, 5, 6, 7, 3, 6, 7, 5, 5, 6, 5, 6, 6,
       5, 5, 7, 6, 5, 5, 8, 5, 8, 8, 6, 3, 7, 3, 6, 6, 5, 5, 6, 5, 5, 7, 7,
       5, 5, 6, 7, 3, 5, 7, 8, 5, 6, 5, 3, 7, 7], dtype=int64)

In [21]:
y_test.shape


Out[21]:
(60L,)

In [22]:
forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
            min_samples_split=1, random_state=None, max_features=None)
clf = forest.fit(X_train, y_train)

In [23]:
y_pred = clf.predict(X_test)

In [24]:
y_pred.shape


Out[24]:
(60L,)

In [25]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()


Confusion matrix, without normalization
[[ 0  0  6  0  0  0]
 [ 0  0  1  0  0  0]
 [ 0  0 21  0  0  0]
 [ 0  0 14  0  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  6  0  0  0]]
Out[25]:
<matplotlib.figure.Figure at 0x1848e5c0>

In [26]:
y_pred


Out[26]:
array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5], dtype=int64)

In [27]:
y_test


Out[27]:
array([6, 8, 8, 5, 6, 7, 5, 4, 5, 3, 7, 5, 6, 7, 3, 6, 7, 5, 5, 6, 5, 6, 6,
       5, 5, 7, 6, 5, 5, 8, 5, 8, 8, 6, 3, 7, 3, 6, 6, 5, 5, 6, 5, 5, 7, 7,
       5, 5, 6, 7, 3, 5, 7, 8, 5, 6, 5, 3, 7, 7], dtype=int64)

In [28]:
Grade = ['A', 'B', 'C' , 'D' , 'F' , 'W' , 'S' , 'U' ,'na']
row = []
for cls in y_train:
    if cls not in row:
        row.append(cls)
row.sort()
print row

row_cm = []
for i in xrange(len(row)):
    Grade = ['A', 'B', 'C' , 'D' , 'F' , 'W' , 'S' , 'U' ,'na']
    grade = Grade[::-1][row[i]]
    print grade
    row_cm.append(grade)
print row_cm


[3, 4, 5, 6, 7, 8]
W
F
D
C
B
A
['W', 'F', 'D', 'C', 'B', 'A']

In [29]:
row_cm


Out[29]:
['W', 'F', 'D', 'C', 'B', 'A']

In [48]:
def plot_confusion_matrix(cm, title=subject, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(row_cm))
    plt.xticks(tick_marks, row_cm, rotation=45)
    plt.yticks(tick_marks, row_cm)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [49]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix')
print(cm)
plt.figure()
plot_confusion_matrix(cm)


CS105
[[ 0  0  6  0  0  0]
 [ 0  0  1  0  0  0]
 [ 0  0 21  0  0  0]
 [ 0  0 14  0  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  6  0  0  0]]

In [50]:
cm


Out[50]:
array([[ 0,  0,  6,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0],
       [ 0,  0, 21,  0,  0,  0],
       [ 0,  0, 14,  0,  0,  0],
       [ 0,  0, 12,  0,  0,  0],
       [ 0,  0,  6,  0,  0,  0]])

In [51]:
plt.show()

In [ ]: