In [1]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
# Load the pre-filtered enrollment table and map letter grades to ordinal codes.
# NOTE(review): error_bad_lines=False silently drops malformed CSV rows — confirm
# the source file is expected to contain them.
GRADE_TO_ORDINAL = {
    'A': 8, 'B+': 7, 'B': 7, 'C+': 6, 'C': 6,
    'D+': 5, 'D': 5, 'F': 4, 'W': 3,
    'S': 2, 'S#': 2, 'U': 1, 'U#': 1,
}

df_file = (
    pd.read_csv('../data/df_dropSub_less20_dropNaResult.csv',
                delimiter=",", skip_blank_lines=True, error_bad_lines=False)
      .drop('Unnamed: 0', axis=1)   # stray index column written by a previous to_csv
      .fillna(0)                    # missing prior-course grades mean "not taken" -> 0
      .replace(GRADE_TO_ORDINAL)    # same value mapping as the original paired lists
)

In [3]:
df_file


Out[3]:
Unnamed: 0.1 3COURSEID 4RESULT 0STUDENTID 1ACADYEAR 2SEMESTER AT316 AT326 BA291 CJ315 ... TA395 TH161 TU100 TU110 TU120 TU122 TU130 TU154 PROVINCEID SCHOOLGPA
0 0 CS101 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
1 1 CS102 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
2 2 EL171 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
3 3 SC135 4 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
4 4 SC185 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
5 5 TH161 6 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
6 6 TU154 5 316644 2552 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 12 3.32
7 7 CS111 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
8 8 EL172 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
9 9 MA211 4 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
10 10 PY228 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
11 11 TU110 6 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
12 12 TU120 5 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
13 13 TU130 7 316644 2552 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 12 3.32
14 14 TU122 7 316644 2552 3 0 0 0 0 ... 0 6 0 6 5 0 7 5 12 3.32
15 15 AT326 8 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
16 16 CS213 6 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
17 17 CS214 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
18 18 CS222 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
19 19 CS223 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
20 20 CS284 7 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
21 21 MA211 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
22 22 SW111 5 316644 2553 1 0 0 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
23 23 AT316 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
24 24 CS251 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
25 25 CS261 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
26 26 CS281 7 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
27 27 MA332 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
28 28 SC135 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
29 29 ST216 6 316644 2553 2 0 8 0 0 ... 0 6 0 6 5 7 7 5 12 3.32
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27965 31292 EL070 2 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27966 31293 MA211 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27967 31294 ST216 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27968 31295 TH161 6 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27969 31296 TU154 4 447240 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 48 3.75
27970 31297 CS101 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27971 31298 CS102 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27972 31299 CS105 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27973 31300 EL070 2 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27974 31301 MA211 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27975 31302 ST216 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27976 31303 TH161 5 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27977 31304 TU154 3 447241 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 13 2.60
27978 31313 CS101 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27979 31314 CS102 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27980 31315 CS105 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27981 31316 EL171 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27982 31317 MA211 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27983 31318 ST216 3 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27984 31319 TH161 6 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27985 31320 TU154 5 447242 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 34 2.93
27986 31325 SC185 3 447242 2557 2 0 0 0 0 ... 0 6 0 0 0 0 0 5 34 2.93
27987 31329 CS101 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27988 31330 CS102 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27989 31331 CS105 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27990 31332 EL070 1 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27991 31333 MA211 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27992 31334 ST216 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27993 31335 TH161 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08
27994 31336 TU154 4 447243 2557 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 84 2.08

27995 rows × 118 columns


In [4]:
# Enrollment count per course id; value_counts() sorts descending by count.
count_courseId = df_file["3COURSEID"].value_counts() 
# NOTE(review): despite the name, no ">= 20" filter is applied here — `more20`
# is the full count series and still contains CS231 with a single row (see the
# Out[4] tail below and the explicit drop('CS231') in a later cell). Confirm
# whether the CSV was already filtered upstream ("dropSub_less20" in its name
# suggests so).
more20 = count_courseId

# NOTE(review): headers/subjects/countSub are re-initialized and actually
# populated in a later cell (In [7]); these initial values are never used.
headers=list(df_file.columns.values)
subjects = []
countSub = 0

# Last expression in the cell: display the per-course counts.
more20


Out[4]:
MA211    1068
CS102     966
CS101     948
TU154     946
TH161     899
CS111     834
CS213     765
EL171     760
SC135     740
PY228     705
EL172     697
SC185     686
TU110     666
CS223     664
TU120     648
ST216     623
CS284     616
EL295     588
TU130     567
MA212     539
MA332     509
CS314     509
CS222     490
CS214     488
CS261     481
CS251     471
CS281     470
CS341     443
EL395     442
CS301     421
         ... 
CS286      53
CS297      53
CS429      51
CS446      50
SW335      50
CS356      49
HR201      49
CS459      48
NS132      44
SO201      44
TA395      42
CJ321      41
CS397      41
CS398      40
CS348      38
CJ317      37
CJ316      36
MW313      36
MA216      35
CS407      35
CS115      34
CS457      32
CS388      31
CS449      30
CS426      30
CS408      27
CJ315      26
CS399      24
CS285      24
CS231       1
dtype: int64

In [5]:
count = 0
# NOTE(review): `subjects` is only populated by cell In [7], which appears
# BELOW this cell — under Restart-&-Run-All, `subjects` is still [] here and
# the DataFrames get zero columns. Move the subject-list cell above this one.
subjects.sort()
precision_rf={}
# CS231 has a single enrollment (see the In [4] counts), too few to model,
# so it is excluded from the per-course precision table.
df_precision = more20.drop('CS231').copy()  # per-course precision scores get written here

# Feature columns start after the id/result/year/semester fields.
list_allsub = df_file.columns[4:]
# Bug fix: the original passed columns=[subjects] / index=[list_allsub],
# wrapping the sequences in an extra list, which pandas interprets as a
# nested (MultiIndex-style) axis instead of one column per subject.
# Pass the sequences directly.
allSubject_df = pd.DataFrame(columns=subjects, index=list_allsub)
top10_df = pd.DataFrame(columns=subjects)

In [6]:
len(df_precision)


Out[6]:
110

In [7]:
headers=list(df_file.columns.values)
# Distinct course ids in first-appearance order; headers[1] is '3COURSEID'.
subjects = []
countSub = 0
# Performance fix: `sub not in subjects` scans the list for every row
# (O(n^2) over ~28k rows). A companion set gives O(1) membership tests
# while `subjects` keeps the exact same order and contents as before.
_seen = set()
for sub in df_file[headers[1]]:
    if sub not in _seen:
        _seen.add(sub)
        subjects.append(sub)
        countSub = countSub+1  # countSub == len(subjects), as in the original

In [25]:
subjects.sort()
for subject in subjects:
    #Create new Dataframe
#subject = 'MA211'
    #print subject             
    df_sub = df_file[df_file['3COURSEID'] == subject]
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()
    #print "Number of %s enrollment: %s"%(subject,count_enrollment)

    A = df_sub.as_matrix()
    X = A[:,6:117]
    X = X.astype(np.int64, copy=False)
    y = A[:,2]
    y = y.astype(np.int64, copy=False)

    # Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=0)

    forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
            min_samples_split=1, random_state=None, max_features=None)
    clf = forest.fit(X_train,y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    print "Accuracy of %s: %s"%(subject,accuracy)
    print "Precision of %s: %s"%(subject,precision)
    print "Recall of %s: %s"%(subject,recall)
    print "------------------------------"
    #     precision_rf[subject] = scores.mean()
    #     df_precision.loc[subject]=precision_rf[subject]


Accuracy of AT316: 0.51724137931
Precision of AT316: [ 0.58333333  0.2       ]
Recall of AT316: [ 0.77777778  0.09090909]
------------------------------
Accuracy of AT326: 0.688888888889
Precision of AT326: [ 1.          0.73529412  0.5       ]
Recall of AT326: [ 0.5         0.83333333  0.38461538]
------------------------------
Accuracy of BA291: 0.333333333333
Precision of BA291: [ 0.          0.2         0.42857143  0.        ]
Recall of BA291: [ 0.   0.2  0.6  0. ]
------------------------------
Accuracy of CJ315: 0.166666666667
Precision of CJ315: [ 0.  1.  0.]
Recall of CJ315: [ 0.   0.2  0. ]
------------------------------
Accuracy of CJ316: 0.75
Precision of CJ316: [ 0.          0.71428571  1.        ]
Recall of CJ316: [ 0.   1.   0.5]
------------------------------
Accuracy of CJ317: 0.75
Precision of CJ317: [ 0.    0.75  1.  ]
Recall of CJ317: [ 0.    0.75  0.75]
------------------------------
Accuracy of CJ321: 0.555555555556
Precision of CJ321: [ 0.   0.5  1. ]
Recall of CJ321: [ 0.    1.    0.25]
------------------------------
Accuracy of CS101: 0.457894736842
Precision of CS101: [ 0.          0.          0.05882353  0.50292398  0.          0.        ]
Recall of CS101: [ 0.          0.          0.04166667  0.88659794  0.          0.        ]
------------------------------
Accuracy of CS102: 0.329896907216
Precision of CS102: [ 0.          0.          0.23529412  0.35820896  0.33333333  0.        ]
Recall of CS102: [ 0.          0.          0.17021277  0.70588235  0.15686275  0.        ]
------------------------------
Accuracy of CS105: 0.233333333333
Precision of CS105: [ 0.          0.          0.25        0.125       0.42857143  0.        ]
Recall of CS105: [ 0.          0.          0.625       0.06666667  0.2         0.        ]
------------------------------
Accuracy of CS111: 0.395209580838
Precision of CS111: [ 0.33333333  0.42105263  0.32        0.37096774  0.4375      0.81818182]
Recall of CS111: [ 0.27272727  0.25806452  0.41025641  0.42592593  0.46666667  0.52941176]
------------------------------
Accuracy of CS115: 0.571428571429
Precision of CS115: [ 0.          0.33333333  0.          1.        ]
Recall of CS115: [ 0.   0.5  0.   1. ]
------------------------------
Accuracy of CS211: 0.444444444444
Precision of CS211: [ 0.5         0.375       0.53846154  0.33333333  0.        ]
Recall of CS211: [ 0.5         0.375       0.58333333  0.25        0.        ]
------------------------------
Accuracy of CS213: 0.509803921569
Precision of CS213: [ 0.          0.4         0.59459459  0.59677419  0.21428571  0.57142857]
Recall of CS213: [ 0.          0.85714286  0.44        0.61666667  0.1875      0.57142857]
------------------------------
Accuracy of CS214: 0.428571428571
Precision of CS214: [ 0.          0.          0.33333333  0.48        0.38888889]
Recall of CS214: [ 0.    0.    0.25  0.48  0.5 ]
------------------------------
Accuracy of CS215: 0.1875
Precision of CS215: [ 0.          0.          0.33333333  0.5         0.        ]
Recall of CS215: [ 0.    0.    0.4   0.25  0.  ]
------------------------------
Accuracy of CS222: 0.602040816327
Precision of CS222: [ 0.          0.          0.35294118  0.65454545  0.7         0.75      ]
Recall of CS222: [ 0.          0.          0.33333333  0.76595745  0.58333333  0.5       ]
------------------------------
Accuracy of CS223: 0.563909774436
Precision of CS223: [ 0.          0.          0.1         0.66666667  0.48148148  0.2       ]
Recall of CS223: [ 0.          0.          0.07142857  0.75949367  0.5         0.14285714]
------------------------------
C:\Anaconda\lib\site-packages\sklearn\metrics\classification.py:958: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
C:\Anaconda\lib\site-packages\sklearn\metrics\classification.py:960: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-ec6a61de32eb> in <module>()
     20     forest = RandomForestClassifier(n_estimators=10, max_depth=None, 
     21             min_samples_split=1, random_state=None, max_features=None)
---> 22     clf = forest.fit(X_train,y_train)
     23     y_pred = clf.predict(X_test)
     24 

C:\Anaconda\lib\site-packages\sklearn\ensemble\forest.pyc in fit(self, X, y, sample_weight)
    193         """
    194         # Validate or convert input data
--> 195         X = check_array(X, dtype=DTYPE, accept_sparse="csc")
    196         if issparse(X):
    197             # Pre-sort indices to avoid that each individual tree of the

C:\Anaconda\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
    358             raise ValueError("Found array with %d sample(s) (shape=%s) while a"
    359                              " minimum of %d is required."
--> 360                              % (n_samples, shape_repr, ensure_min_samples))
    361 
    362     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0, 111)) while a minimum of 1 is required.

In [14]:
X_train


Out[14]:
array([[ 0,  0,  0, ...,  0,  0, 80],
       [ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  0, 36],
       ..., 
       [ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  0, 12],
       [ 0,  0,  0, ...,  0,  0, 30]], dtype=int64)

In [ ]: