In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas.io.sql as pd_sql
import sqlite3 as sql
%matplotlib inline
In [24]:
# Open a connection to the SQLite database file "mooc.db".
# NOTE(review): `con` is not referenced again anywhere in the visible notebook
# and is never closed -- confirm whether this connection is still needed.
con = sql.connect("mooc.db")
In [25]:
# Names of the columns this notebook works with: the student identifier, the
# raw click-stream counts, the derived per-time rates and the `is_complete`
# label. 'max_video' is spelled with an underscore so that it matches the
# column name the notebook later passes to `df.drop(['max_video'], axis=1)`.
FEATURES = [
    'studentid',
    'clicks',
    'plays',
    'pauses',
    'seeks',
    'loads',
    'transcript',
    'max_video',
    'module',
    'videos_clicked',
    'percent_clicked',
    'seeks_neutral',
    'seeks_fastforward',
    'seeks_rewind',
    'speed_changes',
    'len_vid_sum',
    'seek_ff_rate',
    'seek_rw_rate',
    'spd_ch_rate',
    'pause_rate',
    'click_rate',
    'is_complete',
]
In [26]:
# Read the comma-separated export into the working frame (',' is the
# default delimiter of `read_csv`).
df = pd.read_csv("ml3.csv")
In [27]:
# Readable names for the two values of the `is_complete` label column:
# 0 -> the student did not complete the course, 1 -> the student completed it.
LABEL_MAP = {
0: 'NotComplete',
1: 'Completed',
}
In [28]:
# Swap the numeric completion codes for their readable labels.
# `DataFrame.ix` is deprecated (and absent from current pandas), so the
# per-key `.ix` assignment is replaced by a single vectorised `replace`;
# values that are not keys of LABEL_MAP are left untouched, as before.
df['is_complete'] = df['is_complete'].replace(LABEL_MAP)
In [29]:
df.head()
Out[29]:
In [30]:
df.describe()
Out[30]:
In [31]:
# Report the frame's dimensions, then the number of rows per `module` value.
n_rows, n_cols = df.shape
print("{} instances with {} features\n".format(n_rows, n_cols))
print(df.groupby('module')['module'].count())
In [32]:
# Remove the student identifier and the raw event-count columns in one call.
df = df.drop(
    [
        'studentid',
        'plays',
        'pauses',
        'seeks',
        'loads',
        'transcript',
        'seeks_neutral',
        'seeks_fastforward',
        'seeks_rewind',
        'speed_changes',
    ],
    axis=1,
)
In [33]:
# Add a constant column: every row gets the value 8 -- presumably the highest
# module number of the genomics course (TODO confirm).
df['max_module_genomics'] = 8
# Abandoned feature, kept for reference:
#df['clicks_per_module'] = df['clicks']/df['videos_clicked']
# It doesn't work because some students only got to module 0.
In [34]:
df.head()
Out[34]:
In [35]:
# Histogram of the `videos_clicked` column: 10 bins over the range 0-95.
fig, ax = plt.subplots()
ax.hist(df['videos_clicked'], bins=10, range=(0, 95))
ax.set_title('Videos Clicked Count distribution')
ax.set_xlabel('Videos Clicked')
ax.set_ylabel('No. of Students')
plt.show()
In [36]:
# Count the missing values in `seek_ff_rate`.
# A Series only has axis 0, so the boolean `axis=True` passed previously is
# not a valid axis; the default reduction already sums the whole mask.
seek_ff_missing = df['seek_ff_rate'].isnull()
seek_ff_missing.sum()
Out[36]:
In [37]:
# Earlier single-column attempt, kept for reference (its result was never
# assigned back, so it had no effect on `df`):
#df['seek_ff_rate'].fillna(value=0)
# Replace every missing value in the whole frame with 0.
df = df.fillna(0)
In [38]:
# Multiply each rate column by 3600 -- presumably converting per-second rates
# to per-hour rates (TODO confirm the original unit).
for rate_column in ('seek_ff_rate', 'seek_rw_rate', 'spd_ch_rate',
                    'pause_rate', 'click_rate'):
    df[rate_column] = df[rate_column] * 3600
In [39]:
df.head()
Out[39]:
In [40]:
df[df.is_complete == "Completed"].head()
Out[40]:
In [41]:
df[df.is_complete == "NotComplete"].head()
Out[41]:
In [42]:
len(df[df.is_complete == "Completed"])
Out[42]:
In [43]:
len(df[df.is_complete == "NotComplete"])
Out[43]:
In [44]:
df = df.drop(['len_vid_sum'], axis=1)
In [45]:
df = df.drop(['videos_clicked'], axis=1)
In [46]:
df = df.drop(['max_video'], axis=1)
In [47]:
df = df.drop(['max_module_genomics'], axis=1)
In [48]:
df = df.drop(['clicks'], axis=1)
In [61]:
# Create a scatter matrix of the dataframe features
# The plotting helpers moved from `pandas.tools.plotting` to `pandas.plotting`
# in pandas 0.20 and the old module was later removed, so try the current
# location first and fall back to the legacy one on older installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()
In [62]:
# Parallel-coordinates plot of the features, one line per row, coloured by
# the `is_complete` label.
# `pandas.tools.plotting` was superseded by `pandas.plotting` in pandas 0.20
# and later removed; import from the current location, falling back to the
# legacy module on older installs.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates
plt.figure(figsize=(12, 12))
parallel_coordinates(df, 'is_complete')
plt.show()
In [63]:
# RadViz plot of the features, with points coloured by the `is_complete` label.
# `pandas.tools.plotting` was superseded by `pandas.plotting` in pandas 0.20
# and later removed; import from the current location, falling back to the
# legacy module on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
plt.figure(figsize=(12, 12))
radviz(df, 'is_complete')
plt.show()
In [49]:
# `module` and `percent_clicked` skew the ML estimator results: F1 scores of
# .985 and 1.0 are not realistic. Given how the data is laid out, both
# features have to go. They describe the entire course and are computed at
# the end of the course, which skews the data towards those who completed it
# and is no help when we are half-way through and want to tell drop-outs from
# completers. Organised differently -- percent completed up to a specific
# point in time -- this would be a very helpful feature. A good next step is
# to calculate percent-complete as of the moment the estimator is run;
# presumably the information would then be applicable and would no longer
# skew the results.
df = df.drop(['module', 'percent_clicked'], axis=1)
In [50]:
# RadViz plot again, now on the frame without `module` and `percent_clicked`.
# `pandas.tools.plotting` was superseded by `pandas.plotting` in pandas 0.20
# and later removed; import from the current location, falling back to the
# legacy module on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
plt.figure(figsize=(12, 12))
radviz(df, 'is_complete')
plt.show()
In [66]:
#df = df.drop(['click_rate'], axis=1)
In [67]:
"""
from pandas.tools.plotting import radviz
plt.figure(figsize=(12,12))
radviz(df, 'is_complete')
plt.show()
"""
In [51]:
# Turn the readable labels back into numeric codes for the estimators:
# "NotComplete" -> 0, "Completed" -> 1 (applied in that order).
for label_text, label_code in (("NotComplete", 0), ("Completed", 1)):
    df.loc[df.is_complete == label_text, 'is_complete'] = label_code
In [52]:
df.head()
Out[52]:
In [53]:
# Write the modelling table as a space-delimited text file for `load_data`.
# The header row is omitted because `load_data` parses this file with
# `np.loadtxt`, which needs every row to be numeric. The index is still
# written as column 0; `load_data` skips it with `dataset[:, 1:-1]`.
# NOTE(review): this writes to ./capstone_ml while DATA_DIR is built from
# ../capstone_ml -- confirm both resolve to the same directory.
df.to_csv('capstone_ml/mooc_dataset.csv', sep=' ', header=False)
In [2]:
import os
import json
import time
import pickle
In [4]:
from sklearn.datasets.base import Bunch
# Absolute path of the `capstone_ml` folder that sits next to the current
# working directory's parent.
DATA_DIR = os.path.abspath(os.path.join(".", "..", "capstone_ml"))

# List what the data directory contains, leaving out hidden (dot) entries.
for name in os.listdir(DATA_DIR):
    if not name.startswith("."):
        print("- {}".format(name))
In [7]:
def load_data(root=DATA_DIR):
    """
    Assemble the MOOC dataset stored under `root` into a `Bunch`.

    Three files are read from `root`:
      * meta.json        -- must provide `target_names` and `feature_names`
      * README.md        -- free-text description, returned as `DESCR`
      * mooc_dataset.csv -- whitespace-delimited numeric table, read with
                            `np.loadtxt`

    Column layout of the table: column 0 is skipped (presumably the DataFrame
    index written by `to_csv` -- TODO confirm), the last column is the target,
    and every column in between is a feature.

    Returns a `Bunch` with `data`, `target`, `filenames`, `target_names`,
    `feature_names` and `DESCR`.
    """
    paths = {
        'meta': os.path.join(root, 'meta.json'),
        'rdme': os.path.join(root, 'README.md'),
        'data': os.path.join(root, 'mooc_dataset.csv'),
    }

    # Class and feature names come from the JSON metadata.
    with open(paths['meta'], 'r') as meta_file:
        meta = json.load(meta_file)
    target_names = meta['target_names']
    feature_names = meta['feature_names']

    # The README doubles as the dataset description.
    with open(paths['rdme'], 'r') as readme_file:
        description = readme_file.read()

    # Numeric table: one row per student.
    table = np.loadtxt(paths['data'])

    features = table[:, 1:-1]  # all rows; columns 1 .. second-to-last
    labels = table[:, -1]      # all rows; last column only

    return Bunch(
        data=features,
        target=labels,
        filenames=paths,
        target_names=target_names,
        feature_names=feature_names,
        DESCR=description
    )
# Load the dataset once and keep it in a variable for the modelling cells,
# then show the shapes of the feature matrix and the target vector.
dataset = load_data()
for part in (dataset.data, dataset.target):
    print(part.shape)
In [52]:
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
In [12]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Cross-validate an estimator, report its mean scores, then refit it on all
    of the data and pickle the fitted estimator.

    Because of the Scikit-Learn API, one function can do all of the fit and
    evaluate work for any estimator class.

    Parameters
    ----------
    dataset : object exposing `data` (2-D array of features) and `target`
        (1-D array of labels), e.g. the `Bunch` built by `load_data`.
    model : callable returning an unfitted estimator with `fit`/`predict`
        (typically an estimator class).
    label : str
        Human-readable name used in the report; lower-cased with spaces turned
        into hyphens it also names the pickle file ("<label>.pickle").
    **kwargs
        Forwarded unchanged to `model(...)`.

    Side effects
    ------------
    Prints the timing and the mean precision/recall/accuracy/F1 over the
    folds, and writes the fitted estimator to "<label>.pickle" in the current
    working directory. Returns None.
    """
    start = time.time()  # Start the clock!
    scores = {'precision': [], 'recall': [], 'accuracy': [], 'f1': []}

    # 12-fold cross-validation. The folds are shuffled without a fixed seed,
    # so the reported scores vary slightly from run to run.
    for train, test in KFold(dataset.data.shape[0], n_folds=12, shuffle=True):
        X_train, X_test = dataset.data[train], dataset.data[test]
        y_train, y_test = dataset.target[train], dataset.target[test]

        # A fresh estimator per fold so nothing learned leaks between folds.
        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)

        expected = y_test
        predicted = estimator.predict(X_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average='binary'))
        scores['recall'].append(metrics.recall_score(expected, predicted, average='binary'))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average='binary'))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time() - start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

    # Write official estimator to disk: refit on the complete dataset first.
    estimator = model(**kwargs)
    estimator.fit(dataset.data, dataset.target)

    outpath = label.lower().replace(" ", "-") + ".pickle"
    # Pickle data is binary, so the file must be opened in binary mode; text
    # mode ('w') can corrupt the stream on some platforms and fails outright
    # on Python 3.
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    print("\nFitted model written to:\n{}".format(os.path.abspath(outpath)))
In [61]:
# Perform SVC Classification
fit_and_evaluate(dataset, SVC, "MOOC SVM Classifier")
In [45]:
# Perform kNN Classification
fit_and_evaluate(dataset, KNeighborsClassifier, "MOOC kNN Classifier", n_neighbors=3)
In [55]:
# Perform Random Forest Classification
fit_and_evaluate(dataset, RandomForestClassifier, "MOOC Random Forest Classifier")
In [61]:
# Perform Decision Tree Classifier
fit_and_evaluate(dataset, DecisionTreeClassifier, "MOOC Decision Tree Classifier")
In [62]:
# Perform Decision Tree Regressor
# NOTE(review): DecisionTreeRegressor is a regressor, yet fit_and_evaluate
# scores its predictions with classification metrics (precision, recall and
# F1 with average='binary', plus accuracy) -- confirm this run is intended.
fit_and_evaluate(dataset, DecisionTreeRegressor, "MOOC Decision Tree Regressor")
In [59]:
# Perform Gaussian Naive Bayes
fit_and_evaluate(dataset, GaussianNB, "MOOC Gaussian Naive Bayes")
In [63]:
# Perform AdaBoost Classifier
fit_and_evaluate(dataset, AdaBoostClassifier, "MOOC AdaBoost Classifier")
In [46]:
# Perform SGD Classifier
fit_and_evaluate(dataset, SGDClassifier, "MOOC SGD Classifier")
In [ ]: