In [2]:
from numpy import newaxis
from scipy import stats
from seaborn import set as sns_set
from seaborn import FacetGrid, heatmap
from sklearn import grid_search
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from context import *
from util.dfmgmt import initSet, wrangle
from util.polarize import *
In [3]:
# Notebook-wide configuration: plot styling, inline rendering, target column
# and the tuned hyper-parameter.
# Apply seaborn's "whitegrid" theme at the default font scale.
sns_set(style="whitegrid", font_scale=1)
# Render matplotlib figures inside the notebook output.
%matplotlib inline
target = 'charted' # main feature to be predicted
# Read the tuned value from best_param_time.txt and cast it to int; it is used
# below as n_estimators of the random forest.
# NOTE(review): getBestParam is not defined in this notebook - presumably it is
# provided by one of the star imports above; confirm the file's expected format.
bestParam = int(getBestParam('best_param_time.txt'))
In [4]:
# Load the working dataset through the project helper util.dfmgmt.initSet.
# The cells below treat the result as a pandas DataFrame (.head(), .apply(axis=1)).
df = initSet()
In [5]:
# Preview the first rows to sanity-check the loaded data
df.head()
Out[5]:
In [6]:
# Add a 'sentiment_polarity' column by applying `sentiment` to every row (axis=1).
# NOTE(review): `sentiment` is not defined in this notebook - presumably it comes
# from the util.polarize star import; confirm which labels it returns.
df['sentiment_polarity'] = df.apply(sentiment, axis=1) # classify sentiment score
In [7]:
# Keep both year and decade
dropList = ['most_used_term', 'reading_score']
removeList = ['charted']
df_new, features = wrangle(df, dropList, removeList, True)
In [75]:
# NOTE: imported here rather than in the top import cell; kept in place so this
# cell stays runnable on its own.
from sklearn import ensemble

# One seed for every stochastic step in this cell. The split was already seeded
# with 1000; the forest was not, so accuracy, ROC and the confusion matrix
# changed on every run. Seeding both makes Restart & Run All reproducible.
RANDOM_STATE = 1000

# set X and y and build model
X = df_new[features]
y = df_new[target]

# Create separate training and test sets with 60/40 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

# train model with classifier and parameter determined best
# (bestParam is the tuned number of trees read in the configuration cell)
clf = ensemble.RandomForestClassifier(
    n_estimators=bestParam,
    min_samples_leaf=2,
    max_depth=8,
    random_state=RANDOM_STATE,
)
# Last expression of the cell: fit() returns the estimator, which the notebook
# displays as the cell output.
clf.fit(X_train, y_train)
Out[75]:
In [76]:
# Evaluate accuracy of the model on both the training and the test set.
# print(...) with a single %-formatted argument emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print("Accuracy of training set: %0.3f" % clf.score(X_train, y_train))
print("Accuracy of test set: %0.3f" % clf.score(X_test, y_test))

# Plot ROC curve and get AUC score
# Column 1 of predict_proba is the probability of clf.classes_[1]
# (assumed to be the positive "charted" label - TODO confirm the label encoding).
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Determine the false positive and true positive rates
# (t holds the decision thresholds; it is not used further in this cell)
fpr, tpr, t = roc_curve(y_test, y_pred_proba)

# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()

# Get ROC AUC score
print('ROC AUC: %0.3f' % roc_auc_score(y_test, y_pred_proba))
In [77]:
# Confusion matrix of hard predictions on the held-out test set
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Normalise each row (one row per true class) by its total so the cells
# show per-class rates instead of raw counts.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype(float) / row_totals

# Draw the annotated heatmap on an explicit axes object and label both axes
ax = plt.axes()
heatmap(cm_normalized, annot=True, ax=ax)
ax.set_xlabel('Pred')
ax.set_ylabel('True')
plt.show()
In [78]:
# Build an inspection frame: test features plus the predicted and true labels.
# Work on a copy. Plain assignment would alias X_test, so the two column
# assignments below would add 'predict' and the target to the feature matrix
# itself; re-running the scoring or prediction cells would then fail on the
# extra columns, and pandas may emit SettingWithCopyWarning.
df_test = X_test.copy()
df_test['predict'] = y_pred
# y_test is aligned to df_test by index, so each row gets its own true label
df_test[target] = y_test
In [79]:
# Show the last 10 test rows to compare the 'predict' column with the true target
df_test.tail(10)
Out[79]: