Mean Squared Error

1.Load dataset(csv)
2.Split dataset to X_train,X_test,y_train,y_test
3.Run classifier, by .fit(X_train,y_train)
4.mean_squared_error(y_test, clf.predict(X_test))

In [2]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

In [3]:
df_file = pd.read_csv('../data/df_dropSub_less20_dropNaResult.csv',delimiter=",", skip_blank_lines = True, 
                 error_bad_lines=False)
df_file = df_file.drop('Unnamed: 0',axis=1)
df_file = df_file.fillna(0)
df_file = df_file.replace(['A', 'B+', 'B', 'C+', 'C' , 'D+' , 'D' , 'F' , 'W' , 'S' , 'S#' , 'U' , 'U#'], 
                     [8, 7, 7, 6 , 6, 5, 5, 4, 3, 2, 2, 1, 1])

In [4]:
df_file


Out[4]:
Unnamed: 0.1 3COURSEID 4RESULT 0STUDENTID 1ACADYEAR 2SEMESTER AT316 AT326 BA291 CJ315 ... TA395 TH161 TU100 TU110 TU120 TU122 TU130 TU154 PROVINCEID SCHOOLGPA
0 0 CS101 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
1 1 CS102 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
2 2 EL171 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
3 3 SC135 4 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
4 4 SC185 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
5 5 TH161 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
6 6 TU154 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
7 7 CS111 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
8 8 EL172 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
9 9 MA211 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
10 10 PY228 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
11 11 TU110 6 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
12 12 TU120 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
13 13 TU130 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
14 14 TU122 7 316644 2552 3 0 0 0 0 ... 0 6 0 6 5 0 7 5 12 3.32
15 15 AT326 8 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
16 16 CS213 6 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
17 17 CS214 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
18 18 CS222 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
19 19 CS223 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
20 20 CS284 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
21 21 MA211 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
22 22 SW111 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
23 23 AT316 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
24 24 CS251 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
25 25 CS261 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
26 26 CS281 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
27 27 MA332 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
28 28 SC135 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
29 29 ST216 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27965 31292 EL070 2 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27966 31293 MA211 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27967 31294 ST216 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27968 31295 TH161 6 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27969 31296 TU154 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27970 31297 CS101 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27971 31298 CS102 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27972 31299 CS105 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27973 31300 EL070 2 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27974 31301 MA211 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27975 31302 ST216 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27976 31303 TH161 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27977 31304 TU154 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27978 31313 CS101 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27979 31314 CS102 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27980 31315 CS105 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27981 31316 EL171 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27982 31317 MA211 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27983 31318 ST216 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27984 31319 TH161 6 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27985 31320 TU154 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27986 31325 SC185 3 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
27987 31329 CS101 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27988 31330 CS102 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27989 31331 CS105 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27990 31332 EL070 1 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27991 31333 MA211 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27992 31334 ST216 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27993 31335 TH161 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27994 31336 TU154 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08

27995 rows × 118 columns


In [5]:
count_courseId = df_file["3COURSEID"].value_counts() 
more20 = count_courseId

headers=list(df_file.columns.values)
subjects = []
countSub = 0

In [8]:
more20[:10]   #top 10 of number of enrollment


Out[8]:
MA211    1068
CS102     966
CS101     948
TU154     946
TH161     899
CS111     834
CS213     765
EL171     760
SC135     740
PY228     705
dtype: int64

In [10]:
for sub in df_file[headers[1]]:
    if sub not in subjects:
        subjects.append(sub)
        countSub = countSub+1

In [12]:
subjects.remove('CS231')
subjects.sort()

In [13]:
subjects


Out[13]:
['AT316',
 'AT326',
 'BA291',
 'CJ315',
 'CJ316',
 'CJ317',
 'CJ321',
 'CS101',
 'CS102',
 'CS105',
 'CS111',
 'CS115',
 'CS211',
 'CS213',
 'CS214',
 'CS215',
 'CS222',
 'CS223',
 'CS251',
 'CS261',
 'CS281',
 'CS284',
 'CS285',
 'CS286',
 'CS288',
 'CS289',
 'CS295',
 'CS296',
 'CS297',
 'CS300',
 'CS301',
 'CS302',
 'CS311',
 'CS314',
 'CS326',
 'CS341',
 'CS342',
 'CS348',
 'CS356',
 'CS365',
 'CS366',
 'CS367',
 'CS374',
 'CS377',
 'CS385',
 'CS386',
 'CS387',
 'CS388',
 'CS395',
 'CS396',
 'CS397',
 'CS398',
 'CS399',
 'CS401',
 'CS402',
 'CS407',
 'CS408',
 'CS409',
 'CS426',
 'CS427',
 'CS429',
 'CS446',
 'CS449',
 'CS456',
 'CS457',
 'CS459',
 'CS467',
 'CS486',
 'CS487',
 'CS488',
 'CS489',
 'EL070',
 'EL171',
 'EL172',
 'EL295',
 'EL395',
 'ES356',
 'HO201',
 'HR201',
 'LA209',
 'MA211',
 'MA212',
 'MA216',
 'MA332',
 'MW313',
 'MW314',
 'NS132',
 'PY228',
 'SC123',
 'SC135',
 'SC173',
 'SC185',
 'SO201',
 'ST216',
 'SW111',
 'SW212',
 'SW213',
 'SW221',
 'SW335',
 'SW365',
 'SW475',
 'SW478',
 'TA395',
 'TH161',
 'TU100',
 'TU110',
 'TU120',
 'TU122',
 'TU130',
 'TU154']

In [55]:
subject = 'EL070'
df_sub = df_file[df_file['3COURSEID'] == subject]
df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
count_enrollment = df_sub['3COURSEID'].value_counts()
#print "Number of %s enrollment: %s"%(subject,count_enrollment)

A = df_sub.as_matrix()
X = A[:,6:116]
X = X.astype(np.int64, copy=False)
y = A[:,2]
y = y.astype(np.int64, copy=False)

In [56]:
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)

In [57]:
X_train


Out[57]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
X_train.shape


Out[58]:
(183L, 110L)

In [59]:
X_test.shape


Out[59]:
(46L, 110L)

In [69]:
y_train


Out[69]:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 3, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [61]:
y_test.shape


Out[61]:
(46L,)

In [62]:
forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
            min_samples_split=1, random_state=None, max_features=None)
clf = forest.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [63]:
mse = mean_squared_error(y_test,y_pred)

In [64]:
mse


Out[64]:
0.065217391304347824

In [68]:
scores = cross_val_score(clf, X_train, y_train, cv=5)


C:\Anaconda\lib\site-packages\sklearn\cross_validation.py:417: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)

In [66]:
scores


Out[66]:
array([ 0.89361702,  0.89130435,  0.89130435,  0.89130435,  0.93181818])

In [67]:
scores.mean()


Out[67]:
0.89986964931460756

In [72]:
from mlboost.util.confusion_matrix import ConfMatrix  
 clf.fit(X_train, y_train)  
 pred = clf.predict(X_train)  
 labels = list(set(y_train))  
 labels.sort()  
 cm = ConfMatrix(metrics.confusion_matrix(y_train, pred), labels)  
 cm.save_matrix('conf_matrix.p')  
 cm.get_classification()  
 cm.gen_conf_matrix('conf_matrix')  
 cm.gen_highlights('conf_matrix_highlights')


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-72-45b928b74d4a> in <module>()
----> 1 from mlboost.util.confusion_matrix import ConfMatrix
      2 clf.fit(X_train, y_train)
      3 pred = clf.predict(X_train)
      4 labels = list(set(y_train))
      5 labels.sort()

ImportError: No module named mlboost.util.confusion_matrix

In [ ]: