In [1]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
In [2]:
# Read the enrollment table. Blank lines are skipped and malformed rows are
# dropped (error_bad_lines=False) instead of aborting the load.
df_file = pd.read_csv(
    '../data/df_dropSub_less20.csv',
    delimiter=",",
    skip_blank_lines=True,
    error_bad_lines=False
)

# Missing values become 0, then every letter grade is swapped for an integer
# code. The two lists are paired position by position; "+" grades share the
# code of their base letter, and the "#" variants share the code of S / U.
df_file = df_file.fillna(0).replace(
    ['A', 'B+', 'B', 'C+', 'C' , 'D+' , 'D' , 'F' , 'W' , 'S' , 'S#' , 'U' , 'U#'],
    [8, 7, 7, 6 , 6, 5, 5, 4, 3, 2, 2, 1, 1]
)
In [3]:
# Display the loaded frame after NaN filling and grade-letter encoding.
df_file
Out[3]:
In [4]:
# Collect the distinct values of the frame's second column, in order of first
# appearance. (Presumably this is the course-id column that later cells
# address as '3COURSEID' -- TODO confirm.)
headers = list(df_file.columns.values)
subjects = []
# A set answers "already seen?" in constant time, so the scan stays linear
# instead of re-searching the growing list for every row.
seen_subjects = set()
for sub in df_file[headers[1]]:
    if sub not in seen_subjects:
        seen_subjects.add(sub)
        subjects.append(sub)
# Number of distinct subjects; always equal to len(subjects).
countSub = len(subjects)
In [5]:
# Sort the subject list in place so later iteration order is deterministic.
subjects.sort()
In [6]:
# Number of distinct subjects collected above.
len(subjects)
Out[6]:
In [7]:
# Build the feature matrix X and label vector y for every subject in turn.
# Nothing is stored per subject: once the loop ends only the last subject's
# df_sub / A / X / y remain, and the following cell rebuilds all of them for
# 'CS105'. As written, the loop's only lasting effect is to raise if some
# subject's slice cannot be cast to int64.
for subject in subjects:
    # Rows belonging to this subject, shuffled (the permutation is unseeded).
    df_sub = df_file[df_file['3COURSEID'] == subject]
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()
    # .values is the supported replacement for the deprecated as_matrix().
    A = df_sub.values
    # Columns 6..116 are used as features and column 2 as the label
    # (positional indices; presumably the grade columns -- TODO confirm).
    X = A[:, 6:117].astype(np.int64, copy=False)
    y = A[:, 2].astype(np.int64, copy=False)
In [8]:
# Build the feature matrix X and label vector y for a single subject.
subject = 'CS105'
df_sub = df_file[df_file['3COURSEID'] == subject]
# NOTE(review): this shuffle is unseeded, so the row order -- and therefore
# the train/test split made later -- differs from run to run.
df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
count_enrollment = df_sub['3COURSEID'].value_counts()
# .values is the supported replacement for the deprecated as_matrix().
A = df_sub.values
# Columns 6..116 are used as features and column 2 as the label
# (positional indices; presumably the grade columns -- TODO confirm).
X = A[:, 6:117].astype(np.int64, copy=False)
y = A[:, 2].astype(np.int64, copy=False)
In [9]:
# Subject whose rows were selected in the previous cell.
subject
Out[9]:
In [10]:
# Integer feature matrix built from columns 6..116 of the subject's rows.
X
Out[10]:
In [11]:
# (rows, features) of the feature matrix.
X.shape
Out[11]:
In [12]:
# Length of the label vector; should match the row count of X.
y.shape
Out[12]:
In [13]:
# Hold out 20% of the rows as a test set. random_state=0 fixes the shuffle
# performed by the split itself.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0
)
In [14]:
# Training portion of the feature matrix.
X_train
Out[14]:
In [15]:
# Shape of the training features. Every neighbouring cell pairs an array with
# its own .shape, and X.shape was already displayed earlier, so the training
# split is what belongs here.
X_train.shape
Out[15]:
In [16]:
# Held-out portion of the feature matrix.
X_test
Out[16]:
In [17]:
# (rows, features) of the held-out features.
X_test.shape
Out[17]:
In [18]:
# Integer grade codes for the training rows.
y_train
Out[18]:
In [19]:
# Number of training labels; should match the row count of X_train.
y_train.shape
Out[19]:
In [20]:
# Integer grade codes for the held-out rows.
y_test
Out[20]:
In [21]:
# Number of held-out labels; should match the row count of X_test.
y_test.shape
Out[21]:
In [22]:
# Fit a 10-tree random forest with unlimited depth that considers every
# feature at each split.
# min_samples_split is 2, the smallest valid integer value: a node holding a
# single sample cannot be split, and scikit-learn >= 0.18 rejects 1 with a
# ValueError. Older releases treated 1 the same as 2, so results there are
# unchanged.
# NOTE(review): random_state=None leaves the forest unseeded, so fitted trees
# differ between runs.
forest = RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=None,
                                max_features=None)
# fit() returns the estimator itself, so clf and forest are the same object.
clf = forest.fit(X_train, y_train)
In [23]:
# Predict a grade code for every held-out row.
y_pred = clf.predict(X_test)
In [24]:
# One prediction per held-out row; should match y_test.shape.
y_pred.shape
Out[24]:
In [25]:
# Compute the confusion matrix of true vs. predicted grade codes and print
# the raw counts. No figure is opened here: nothing is drawn in this cell, and
# an empty figure would only linger until the final plt.show().
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
Out[25]:
In [26]:
# Predicted grade codes for the held-out rows ...
y_pred
Out[26]:
In [27]:
# ... and the true codes for the same rows, for side-by-side comparison.
y_test
Out[27]:
In [28]:
# Letter for each integer grade code. Reversing the list makes the code the
# index: 0 -> 'na', 1 -> 'U', 2 -> 'S', 3 -> 'W', 4 -> 'F', ..., 8 -> 'A'.
Grade = ['A', 'B', 'C' , 'D' , 'F' , 'W' , 'S' , 'U' ,'na']
grade_by_code = Grade[::-1]

# Codes that label the confusion matrix, in ascending order.
# confusion_matrix(y_test, y_pred) orders its rows/columns by the sorted
# labels that occur in y_test or y_pred, so exactly that set is used here.
# Taking the labels from y_train instead can yield a different set and leave
# the tick labels misaligned with the matrix.
row = sorted(int(code) for code in set(y_test) | set(y_pred))
print(row)

# Translate each code into the letter used as a tick label.
row_cm = []
for code in row:
    grade = grade_by_code[code]
    print(grade)
    row_cm.append(grade)
print(row_cm)
In [29]:
# Grade letters that will be used as tick labels on the confusion-matrix plot.
row_cm
Out[29]:
In [48]:
def plot_confusion_matrix(cm, title=subject, cmap=plt.cm.Blues, labels=None):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix to display.
    title : str
        Plot title. The default is the value `subject` held when this
        function was defined; re-run the definition (or pass `title`) after
        changing `subject`.
    cmap : matplotlib colormap
        Colormap used by `imshow`.
    labels : sequence of str, optional
        Tick labels for both axes, one per row/column of `cm`, in matrix
        order. Defaults to the notebook-level `row_cm`, looked up when the
        function is called.
    """
    if labels is None:
        labels = row_cm
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Same labels on both axes; x labels are rotated so they do not collide.
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [49]:
# Show at most two decimals whenever numpy arrays are printed from here on.
np.set_printoptions(precision=2)

# Recompute the confusion matrix for the held-out rows and print it.
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix')
print(cm)

# Open a fresh figure and render the matrix into it.
plt.figure()
plot_confusion_matrix(cm)
In [50]:
# Raw confusion-matrix counts behind the plot.
cm
Out[50]:
In [51]:
# Render any figures that are still open.
plt.show()
In [ ]: