In [1]:
from numpy import newaxis
from scipy import stats
from seaborn import set as sns_set
from seaborn import FacetGrid, heatmap
from sklearn import grid_search
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from context import *
from util.dfmgmt import initSet, wrangle, dropOutliers
from util.polarize import *
In [2]:
# Notebook-wide plot styling: seaborn "whitegrid" theme at the default font scale
sns_set(style="whitegrid", font_scale=1)
# Render matplotlib figures inline in the notebook output
%matplotlib inline
target = 'charted' # main feature to be predicted
# Value later passed as the regularization parameter C of LogisticRegression.
# getBestParam is not defined in this notebook -- presumably it comes from one of
# the star imports (context / util.polarize); verify.
bestParam = float(getBestParam('best_param_no_time.txt'))
In [3]:
# Load the data set via the project helper initSet() (util.dfmgmt)
df = initSet()
# Keep only rows whose decade is not 2010. Take an explicit copy so that the
# column added to `df` later in the notebook is written to an independent frame
# instead of a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df[df['decade'] != 2010].copy()
In [4]:
# Sanity check: display the first rows of the loaded, decade-filtered frame
df.head()
Out[4]:
In [5]:
# Add a 'sentiment_polarity' column by applying `sentiment` to every row (axis=1).
# NOTE(review): `sentiment` is not defined in this notebook -- presumably it is
# provided by the star import from util.polarize; confirm.
df['sentiment_polarity'] = df.apply(sentiment, axis=1) # classify sentiment score
In [6]:
# Columns passed to wrangle() as the drop list: year and decade, plus
# most_used_term, reading_score and the raw sentiment column
dropList = ['most_used_term', 'reading_score', 'sentiment',
'year', 'decade']
# 'charted' is the prediction target (same value as `target`); it is passed
# separately as the remove list
removeList = ['charted']
# wrangle() is a project helper (util.dfmgmt). It returns the frame used for
# modelling and `features`, which is used below to select the model's input columns.
# NOTE(review): the meaning of the trailing True flag is not visible here -- confirm.
df_new, features = wrangle(df, dropList, removeList, True)
# Preview the wrangled frame
df_new.head()
Out[6]:
In [7]:
from sklearn import linear_model

# Feature matrix (columns listed in `features`) and label vector (`target` column)
X, y = df_new[features], df_new[target]

# Hold out 40% of the rows for testing; the fixed random_state keeps the
# 60/40 split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1000)

# Logistic regression with the regularization parameter C loaded earlier
clf = linear_model.LogisticRegression(C=bestParam)
# Fit on the training portion; kept as the last expression so the fitted
# estimator's repr is shown as the cell output
clf.fit(X_train, y_train)
Out[7]:
In [8]:
# Evaluate accuracy of the model on the training and test sets.
# print is written as a parenthesised single-argument call so the cell runs
# unchanged on both Python 2 and Python 3 (the output text is identical).
print("Accuracy of training set: %0.3f" % clf.score(X_train, y_train))
print("Accuracy of test set: %0.3f" % clf.score(X_test, y_test))
# Predicted probability for the second class (column 1 of predict_proba),
# used for both the ROC curve and the AUC score
y_pred_proba = clf.predict_proba(X_test)[:,1]
# False positive rates, true positive rates and thresholds (`t` is not used below)
fpr, tpr, t = roc_curve(y_test, y_pred_proba)
# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()
# Area under the ROC curve on the test set
print('ROC AUC: %0.3f' % roc_auc_score(y_test, y_pred_proba))
In [9]:
# Confusion matrix of true vs. predicted labels on the test set
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Divide every row by its total so each true-label row sums to 1
row_totals = cm.sum(axis=1)[:, newaxis]
cm_normalized = cm.astype('float') / row_totals
# Draw the annotated heatmap on an explicitly created axes
ax = plt.axes()
heatmap(cm_normalized, annot=True, ax=ax)
ax.set(ylabel='True', xlabel='Pred')
plt.show()
In [10]:
# Build an inspection frame from the test features plus predicted and true labels.
# Work on a copy: assigning columns to X_test itself would add 'predict' and the
# target column to the feature matrix, so re-running the prediction cells would
# feed the label back into the model input (and raise a SettingWithCopyWarning).
df_test = X_test.copy()
# y_pred is positional (same row order as X_test); y_test aligns on the index
df_test['predict'] = y_pred
df_test[target] = y_test
In [11]:
# Show the last 10 test rows, including the 'predict' and target columns added above
df_test.tail(10)
Out[11]:
In [15]:
def _facet_scatter(data, hue):
    """Scatter 'density_raw' against 'verbs', one panel per sentiment polarity.

    Parameters:
        data: frame holding 'density_raw', 'verbs', 'sentiment_polarity'
            and the column named by `hue`.
        hue: name of the column used to colour the points.

    Returns:
        The seaborn FacetGrid that was drawn.
    """
    # FacetGrid creates its own figure, so no plt.figure() call is made here;
    # a preceding plt.figure(figsize=...) would only leave an empty figure behind.
    # `size` is the panel-height keyword of the seaborn version this notebook
    # targets (newer releases call it `height`).
    grid = FacetGrid(data, hue=hue, col='sentiment_polarity', size=4)
    grid.map(plt.scatter, 'density_raw', 'verbs', alpha=0.6)
    grid.add_legend()
    return grid

# Points coloured by the model's prediction
g = _facet_scatter(df_test, 'predict')
# Same view coloured by the actual 'charted' label, for visual comparison
x = _facet_scatter(df_test, 'charted')
plt.show()
In [ ]: