In [146]:
%matplotlib inline
In [215]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import cross_validation
from sklearn import grid_search
results = pickle.load(open("results-25perc-noloc.pickle", "rb"))
model_results = results[0]
df = results[1]
predictions = results[2]
best_score = results[3]
best_params = results[4]
best_features = results[5]
best_model = results[6]
print "Loaded"
In [217]:
print "Best score:", best_score
#print "Best features:", best_features
print "Best params:", best_params
Build a histogram of the percentage of correct predictions for each category
In [218]:
df_test = df[(df["is_test"] == True)]
df_test["prediction"] = predictions
#print df_test.head()
# Compare the percent correct to the results from earlier to make sure things are lined up right
print "Calculated accuracy:", sum(df_test["label"] == df_test["prediction"]) / float(len(df_test))
print "Model accuracy:", best_score
In [219]:
df_correct = df_test[(df_test["label"] == df_test["prediction"])]
df_incorrect = df_test[(df_test["label"] != df_test["prediction"])]
#df_correct.describe()
#df_test.describe()
#plt.hist(correct_labels)
#print df.describe()
print "Correct predictions:", df_correct.groupby(["label"])["prediction"].count()
print "Incorrect predictions:", df_incorrect.groupby(["label"])["prediction"].count()
Summary statistics of text length for correct and incorrect predictions
In [220]:
print df_correct.describe()
print df_incorrect.describe()
In [221]:
# Reshape the grid-search results for plotting: for every feat_name, keep
# three parallel lists -- "C" and "G" (the C and gamma parameter values of
# each grid point) and "S" (the score recorded for that point).
d3_data = {}
for entry in model_results:
    # Register a fresh series up front; a repeated feat_name overwrites the
    # earlier one.
    series = d3_data[entry["feat_name"]] = {"C": [], "G": [], "S": []}
    for grid_point in entry["grid_scores"]:
        # Element 0 holds the parameter dict, element 1 the score.
        params, score = grid_point[0], grid_point[1]
        series["C"].append(params["C"])
        series["G"].append(params["gamma"])
        series["S"].append(score)
In [222]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib import pylab
# Default figure size (width, height) for every plot drawn after this line.
# plt.rcParams and pylab.rcParams are the same matplotlib rcParams object.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
def d3_plot(X, Y, Z, xticks=(0, 5000, 10000, 15000), yticks=(0, 1.5, 3, 4.5),
            zlim=(0.5, 0.75), title=None):
    """Show a 3D scatter plot of accuracy over a (C, gamma) parameter grid.

    Parameters
    ----------
    X, Y, Z : sequences of numbers
        C values, gamma values and accuracy scores, one entry per grid
        point. The x and y axes run from 0 to max(X) and max(Y).
    xticks, yticks : sequences of numbers, optional
        Tick positions for the C and gamma axes.
    zlim : (low, high), optional
        Visible accuracy range; points outside it are not shown.
    title : str, optional
        Figure title; omitted when None.

    Raises
    ------
    ValueError
        If X, Y or Z is empty (there is nothing to plot and no axis
        maximum to compute).
    """
    # Fail early with a clear message instead of leaving behind an empty
    # figure when max() is later called on an empty sequence.
    if len(X) == 0 or len(Y) == 0 or len(Z) == 0:
        raise ValueError("d3_plot needs at least one (C, gamma, accuracy) point")

    fig = plt.figure()
    # add_subplot(..., projection='3d') is the supported way to get 3D axes;
    # passing keyword arguments to fig.gca() was deprecated in Matplotlib 3.4
    # and removed in later releases.
    ax = fig.add_subplot(111, projection='3d')

    ax.set_xlabel("C", weight="bold", size="xx-large")
    ax.set_xticks(list(xticks))
    ax.set_xlim(0, max(X))

    ax.set_ylabel("gamma", weight="bold", size="xx-large")
    ax.set_yticks(list(yticks))
    ax.set_ylim(0, max(Y))

    ax.set_zlabel("Accuracy", weight="bold", size="xx-large")
    ax.set_zlim(zlim[0], zlim[1])

    if title is not None:
        ax.set_title(title)

    ax.scatter(X, Y, Z, c='b', marker='o')

    # Ten evenly spaced z ticks, labelled with two decimals.
    ax.zaxis.set_major_locator(LinearLocator(10))
    ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
    plt.show()
In [224]:
# Accuracy over the (C, gamma) grid for feat_name "area".
d3_plot(*(np.array(d3_data["area"][axis]) for axis in ("C", "G", "S")))
In [225]:
# Accuracy over the (C, gamma) grid for feat_name "line".
d3_plot(*(np.array(d3_data["line"][axis]) for axis in ("C", "G", "S")))
In [226]:
# Accuracy over the (C, gamma) grid for feat_name "word".
d3_plot(*(np.array(d3_data["word"][axis]) for axis in ("C", "G", "S")))
In [ ]:
In [ ]:
In [ ]: