In [2]:
import pandas as pd
import numpy as np
import pickle
import xlwt
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
In [3]:
# Load the pre-filtered enrollment CSV, drop the leftover index column,
# zero-fill missing cells, and recode letter grades as integers.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases
# (superseded by `on_bad_lines`) — confirm the target pandas version.
df_file = (
    pd.read_csv(
        '../data/df_dropSub_less20_dropNaResult.csv',
        delimiter=",",
        skip_blank_lines=True,
        error_bad_lines=False,
    )
    .drop('Unnamed: 0', axis=1)
    .fillna(0)
    # The two lists are positional pairs: 'A'->8, 'B+'/'B'->7, 'C+'/'C'->6,
    # 'D+'/'D'->5, 'F'->4, 'W'->3, 'S'/'S#'->2, 'U'/'U#'->1.
    .replace(
        ['A', 'B+', 'B', 'C+', 'C', 'D+', 'D', 'F', 'W', 'S', 'S#', 'U', 'U#'],
        [8, 7, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1],
    )
)
In [4]:
# Preview only the first rows; rendering the whole frame bloats the saved notebook.
df_file.head()
Out[4]:
In [5]:
# Enrollment count per course id; `more20` is a second name for the same Series.
count_courseId = more20 = df_file["3COURSEID"].value_counts()

# Column names of the cleaned frame, as a plain list.
headers = list(df_file.columns)

# Accumulators filled by the subject-collection cell further down.
subjects, countSub = [], 0
In [8]:
# First ten entries of the per-course enrollment counts (value_counts orders them largest first).
more20.head(10)
Out[8]:
In [10]:
# Distinct values of the second column (headers[1]), in order of first appearance.
# Series.unique() preserves that order and replaces the Python loop that scanned
# the growing `subjects` list for membership on every row.
subjects = df_file[headers[1]].unique().tolist()

# Number of distinct subjects found.
countSub = len(subjects)
In [12]:
# Exclude 'CS231' and sort the remaining subject ids.
# Rebuilding the list (instead of list.remove) keeps this cell idempotent:
# re-running it no longer raises ValueError once 'CS231' is already gone.
# NOTE(review): the reason 'CS231' is excluded is not shown here — confirm.
subjects = sorted(s for s in subjects if s != 'CS231')
In [13]:
# Show the subject ids that remain after 'CS231' was removed and the list sorted.
subjects
Out[13]:
In [55]:
# Build the feature matrix X and label vector y for a single course.
subject = 'EL070'
df_sub = df_file[df_file['3COURSEID'] == subject]

# Shuffle the rows with a fixed seed so the seeded train/test split that
# follows yields the same partition on every run.
df_sub = df_sub.iloc[np.random.RandomState(0).permutation(len(df_sub))]

# Number of rows for the selected course.
count_enrollment = df_sub['3COURSEID'].value_counts()

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# `.values` is available on old and new pandas alike.
A = df_sub.values

# Columns 6..115 are used as features and column 2 as the target.
# NOTE(review): presumably column 2 holds the recoded grade for this course
# and columns 6..115 the grades of other courses — confirm against the CSV.
X = A[:, 6:116]
X = X.astype(np.int64, copy=False)
y = A[:, 2]
y = y.astype(np.int64, copy=False)
In [56]:
# Hold out 20% of the samples for testing; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
In [57]:
# Inspect the training feature matrix produced by the split.
X_train
Out[57]:
In [58]:
# Shape of the training feature matrix.
X_train.shape
Out[58]:
In [59]:
# Shape of the held-out feature matrix.
X_test.shape
Out[59]:
In [69]:
# Inspect the training labels.
y_train
Out[69]:
In [61]:
# Shape of the held-out label vector.
y_test.shape
Out[61]:
In [62]:
# Random forest of 10 trees with unlimited depth, considering every feature
# at each split.
# min_samples_split=2 replaces the original value of 1: scikit-learn >= 0.18
# rejects 1 with a ValueError, and on older releases 1 behaved like 2 because
# a single-sample node cannot be split.
# random_state is fixed so predictions and scores are reproducible.
forest = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    max_features=None,
)

# fit() returns the estimator itself, so `clf` and `forest` are the same object.
clf = forest.fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = clf.predict(X_test)
In [63]:
# Mean squared error between the held-out labels and the forest's predictions.
# NOTE(review): the labels are integer class codes fed to a classifier, and MSE
# treats them as numeric distances — confirm this is the intended metric.
mse = mean_squared_error(y_test,y_pred)
In [64]:
# Show the test-set mean squared error.
mse
Out[64]:
In [68]:
# 5-fold cross-validation of the forest, run on the training split only.
scores = cross_val_score(clf, X_train, y_train, cv=5)
In [66]:
# Score of each of the five cross-validation folds.
scores
Out[66]:
In [67]:
# Average score across the cross-validation folds.
scores.mean()
Out[67]:
In [72]:
from mlboost.util.confusion_matrix import ConfMatrix

# Refit the forest and predict the training split itself.
# NOTE(review): this is a training-set confusion matrix, not a held-out one —
# confirm that is intended.
clf.fit(X_train, y_train)
pred = clf.predict(X_train)

# Sorted distinct training labels, used as the row/column order of the matrix.
labels = sorted(set(y_train))

# Use `confusion_matrix` as imported from sklearn.metrics at the top of the
# notebook; the original `metrics.confusion_matrix` raised NameError because
# no `metrics` module is ever imported.
# Passing `labels` pins the matrix ordering to the list handed to ConfMatrix.
cm = ConfMatrix(confusion_matrix(y_train, pred, labels=labels), labels)

# Persist the matrix and generate the report artifacts.
cm.save_matrix('conf_matrix.p')
cm.get_classification()
cm.gen_conf_matrix('conf_matrix')
cm.gen_highlights('conf_matrix_highlights')
In [ ]: