In [1]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
In [2]:
# Load the grade table. error_bad_lines=False makes pandas skip malformed
# CSV rows instead of aborting the read.
df_file = pd.read_csv(
    '../data/df_dropSub_less20_dropNaResult.csv',
    delimiter=",",
    skip_blank_lines=True,
    error_bad_lines=False
)

# Discard the 'Unnamed: 0' column, then turn every missing value into 0.
df_file = df_file.drop('Unnamed: 0', axis=1).fillna(0)

# Letter grades and the integer code each one is replaced by (paired by
# position). '+' grades share the code of the plain letter, and the '#'
# variants share the code of 'S' / 'U'.
grade_letters = ['A', 'B+', 'B', 'C+', 'C', 'D+', 'D', 'F', 'W', 'S', 'S#', 'U', 'U#']
grade_codes = [8, 7, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1]
df_file = df_file.replace(grade_letters, grade_codes)
In [3]:
# Display the cleaned frame (letter grades already replaced by integer codes).
df_file
Out[3]:
In [4]:
# Number of rows per distinct '3COURSEID' value. `more20` is a second name
# for the very same Series (an alias, not a copy).
count_courseId = more20 = df_file["3COURSEID"].value_counts()

# Column names of the frame as a plain Python list.
headers = list(df_file.columns.values)

# Nothing collected yet: empty subject list and a zero counter.
subjects, countSub = [], 0
In [5]:
# Bookkeeping containers used for collecting results.
count = 0
precision_rf = dict()

# NOTE(review): read top to bottom, `subjects` is still the empty list at
# this point (it is filled by a later cell) -- confirm the intended cell order.
subjects.sort()

# Enrollment counts with the 'CS231' entry dropped.
df_precision = more20.drop('CS231').copy()

# Every column label from position 4 onwards.
list_allsub = df_file.columns[4:]

# Empty result frames. NOTE(review): `subjects` and `list_allsub` are each
# wrapped in an extra outer list here -- verify that this nesting is intended.
allSubject_df = pd.DataFrame(index=[list_allsub], columns=[subjects])
top10_df = pd.DataFrame(columns=[subjects])
In [6]:
headers = list(df_file.columns.values)

# Build the list of distinct values found in the second column of df_file,
# kept in order of first appearance. Series.unique() does this in one
# hash-based pass, replacing the earlier per-row `sub not in subjects` scan
# of a growing list.
# NOTE(review): headers[1] is presumably the '3COURSEID' column used by the
# other cells -- confirm against the CSV layout.
subjects = df_file[headers[1]].unique().tolist()

# Number of distinct subjects collected.
# NOTE(review): assumes the original counter was incremented once per newly
# appended subject (the notebook export lost its indentation).
countSub = len(subjects)
In [7]:
# Sort the collected subject list in place.
subjects.sort()
In [8]:
# Drop 'CS231' from the subject list. The membership guard keeps this cell
# safe to re-run: an unguarded list.remove() raises ValueError once the
# entry is already gone.
if 'CS231' in subjects:
    subjects.remove('CS231')
In [9]:
# Number of subjects left in the list.
len(subjects)
Out[9]:
In [16]:
# Work on a single course for the step-by-step cells that follow.
subject = 'MA211'

# Keep only this course's rows, then put them in a random order.
# NOTE(review): no seed is set in the visible cells, so the order differs
# from run to run.
df_sub = df_file[df_file['3COURSEID'] == subject]
shuffled_positions = np.random.permutation(len(df_sub))
df_sub = df_sub.iloc[shuffled_positions]

# Row count for this course (only needed for optional reporting).
count_enrollment = df_sub['3COURSEID'].value_counts()

# Plain array view of the rows: columns 6..115 become the features and
# column 2 the target, both converted to int64.
# NOTE(review): the per-subject loop further down slices 6:117 instead of
# 6:116 -- confirm which upper bound is correct.
A = df_sub.as_matrix()
X = A[:, 6:116].astype(np.int64, copy=False)
y = A[:, 2].astype(np.int64, copy=False)
In [17]:
# Show which subject is currently selected.
subject
Out[17]:
In [18]:
# Display the int64 feature matrix.
X
Out[18]:
In [19]:
# Dimensions of the feature matrix.
X.shape
Out[19]:
In [20]:
# Display the int64 target vector (taken from column 2 of the subject rows).
y
Out[20]:
In [21]:
# Dimensions of the target vector.
y.shape
Out[21]:
In [232]:
# Hold out 20% of the rows as a test set; random_state=0 pins the split
# for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
In [233]:
# Display the training features.
X_train
Out[233]:
In [234]:
# Dimensions of the training features.
X_train.shape
Out[234]:
In [235]:
# Display the held-out test features.
X_test
Out[235]:
In [236]:
# Dimensions of the held-out test features.
X_test.shape
Out[236]:
In [97]:
# Dimensions of the training features (repeats an earlier check).
X_train.shape
Out[97]:
In [98]:
# Display the training targets.
y_train
Out[98]:
In [99]:
# Dimensions of the training targets.
y_train.shape
Out[99]:
In [100]:
# Display the held-out test targets.
y_test
Out[100]:
In [101]:
# Dimensions of the held-out test targets.
y_test.shape
Out[101]:
In [18]:
# Random forest of 10 trees with no depth limit, considering every feature
# at each split. min_samples_split is 2 rather than 1: scikit-learn >= 0.18
# rejects an integer value of 1, and a node holding a single sample cannot
# be split anyway.
forest = RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=None,
                                max_features=None)

# NOTE(review): this fits on the full X / y rather than X_train / y_train,
# so later predictions on X are made on data the model has already seen.
clf = forest.fit(X, y)
In [ ]:
In [109]:
# Display the confusion matrix.
# NOTE(review): `cm` is first assigned in a later cell, so on a fresh
# top-to-bottom run this line raises NameError.
cm
Out[109]:
In [104]:
# Display the training features (repeats an earlier check).
X_train
Out[104]:
In [61]:
# Number of rows in the training features.
len(X_train)
Out[61]:
In [24]:
# Predict a grade code for every row of X.
# NOTE(review): X is the same data the classifier was fitted on in the
# cell above, so these are training-set predictions.
y_pred = clf.predict(X)
In [25]:
# Dimensions of the prediction vector.
y_pred.shape
Out[25]:
In [26]:
# Tabulate the true grade codes in y against the predicted codes in y_pred
# and print the raw counts.
cm = confusion_matrix(y_true=y, y_pred=y_pred)
print('Confusion matrix, without normalization')
print(cm)
# Open a new, empty figure.
plt.figure()
Out[26]:
In [20]:
# Grade names ordered from highest code to lowest. Reversed, position k
# holds the name for integer code k (0 -> 'na', 1 -> 'U', ..., 8 -> 'A').
# The lookup table is built once instead of on every loop iteration.
Grade = ['A', 'B', 'C', 'D', 'F', 'W', 'S', 'U', 'na']
code_to_grade = Grade[::-1]

# Distinct grade codes that occur in y, in ascending order.
row = sorted(set(y))
print(row)

# Translate each code into its grade name; the names are used as tick
# labels when the confusion matrix is plotted.
row_cm = []
for code in row:
    grade = code_to_grade[code]
    print(grade)
    row_cm.append(grade)
print(row_cm)
In [21]:
# Display the grade names used as confusion-matrix labels.
row_cm
Out[21]:
In [22]:
def plot_confusion_matrix(cm, title=None, cmap=plt.cm.Blues, labels=None):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; rows are true labels, columns are predicted labels.
    title : str, optional
        Plot title. When omitted, the notebook-level ``subject`` is read at
        call time, so the title follows the subject currently being
        processed instead of whatever ``subject`` held when this function
        was defined.
    cmap : matplotlib colormap, optional
        Colormap passed to ``imshow``.
    labels : sequence of str, optional
        Tick labels for both axes. When omitted, the notebook-level
        ``row_cm`` is used.
    """
    if title is None:
        title = subject
    if labels is None:
        labels = row_cm

    # Write each cell's value at its (column, row) position.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, '%.1f' % cm[i, j])

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [34]:
# Print floats with two decimals from here on.
np.set_printoptions(precision=2)

# Recompute the confusion matrix of y against y_pred, print it with its
# grade labels, then draw it on a new figure.
cm = confusion_matrix(y_true=y, y_pred=y_pred)
print('Confusion matrix')
print(row_cm)
print(cm)

plt.figure()
plot_confusion_matrix(cm)
In [35]:
# Display the confusion matrix.
cm
Out[35]:
In [36]:
# Render the open matplotlib figure(s).
plt.show()
In [237]:
# Train and evaluate one random forest per subject, printing the confusion
# matrix obtained on a 20% hold-out split.
# NOTE(review): the row shuffle and the forest are unseeded (no seed is set
# in the visible cells), so results vary between runs even though the split
# itself uses random_state=0.
for subject in subjects:
    # Rows for this course only, put into a random order.
    df_sub = df_file[df_file['3COURSEID'] == subject]
    df_sub = df_sub.iloc[np.random.permutation(len(df_sub))]
    count_enrollment = df_sub['3COURSEID'].value_counts()

    # Features from columns 6..116, target from column 2, both as int64.
    # NOTE(review): the single-subject cell earlier slices 6:116 instead of
    # 6:117 -- confirm which upper bound is correct.
    A = df_sub.as_matrix()
    X = A[:, 6:117].astype(np.int64, copy=False)
    y = A[:, 2].astype(np.int64, copy=False)

    # Split the data into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # min_samples_split is 2 rather than 1: scikit-learn >= 0.18 rejects an
    # integer value of 1, and a single-sample node cannot be split anyway.
    forest = RandomForestClassifier(n_estimators=10, max_depth=None,
                                    min_samples_split=2, random_state=None,
                                    max_features=None)
    clf = forest.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    print('Confusion matrix of %s' % subject)
    print(cm)
    # No figure is opened here: nothing is drawn in this loop, and an empty
    # figure per subject would pile up without ever being closed. To plot,
    # open a figure and call plot_confusion_matrix(cm) with tick labels
    # that match this subject's grade codes.
In [ ]: