In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
from mpl_toolkits.mplot3d import Axes3D
import scipy as sp
from scipy.interpolate import griddata
import plotly
from plotly.graph_objs import *
from json_io import list_from_json
plotly.offline.init_notebook_mode()
plt.style.use("ggplot")
# total number of samples and total number of features for each corpus
twitterDatasetSize = 215977
twitterFeatureSize = 3145772
redditDatasetSize = 745376
redditFeatureSize = 12490143
In [8]:
# synthetic results for exercising the plotting functions:
# random sample counts, feature counts, and scores
real_elem = np.random.randint(0, 30000, 200)
# pin the first four points to the corners of the (samples, features)
# range so the interpolation grid below is fully covered
real_elem[0] = 0
real_elem[1] = 30000
real_elem[2] = 0
real_elem[3] = 30000
real_feat = np.random.randint(0, 100000, 200)
real_feat[0] = 0
real_feat[1] = 100000
real_feat[2] = 100000
real_feat[3] = 0
real_scores = np.random.rand(len(real_elem))
# zero out the scores at the pinned corner points
for i in range(0, 4):
    real_scores[i] = 0
print(real_elem.shape)
print(real_feat.shape)
print(real_scores.shape)
In [9]:
def graphHeatmap(numSamplesList, numTopFeaturesList, scoresList, interpMethod,
saGridMin=None, saGridMax=None, saGridStep=None, fGridMin=None, fGridMax=None, fGridStep=None):
"""
plot a heatmap visualization of classification results and their interpolation, where interpolation points
are determined by a bounded grid.
For example: saGridMin=0, saGridMax=5000, saGridStep=25, fGridMin=0, fGridMax=1000, fGridStep=10
Interpolation will occur at 200 * 100 = 20,000 points in a grid bounded by (0, 5000), (0, 1000) with samples step-size=25 and feature step-size=10
(0,0) , (0, 10) , (0, 20) , ... (0, 990)
(25,0) , (25, 10) , (25, 20) , ... (25, 990)
.
.
.
(4975,0), (4975,10), (4975,20), ... (4975, 990)
Parameters ------
numSamplesList: array-like
    array whose ith element, after flattening, is the number of samples used during the ith train/test evaluation
numTopFeaturesList: array-like
    array whose ith element, after flattening, is the number of top features used during the ith train/test evaluation
scoresList: array-like
    array whose ith element, after flattening, is the classifier score obtained using numSamplesList[i] samples
    and the top numTopFeaturesList[i] features
interpMethod: string
    one of "nearest", "linear", or "cubic"; interpolation is performed with scipy.interpolate.griddata using this method
saGridMin: int
first point to interpolate samples at
saGridMax: int
last point to interpolate samples at
saGridStep: int
step-size for samples interpolation
fGridMin: int
first point to interpolate top features at
fGridMax: int
last point to interpolate top features at
fGridStep: int
step-size for top features interpolation
"""
flattenedSamples = np.ravel(numSamplesList)
flattenedFeats = np.ravel(numTopFeaturesList)
flattenedScores = np.ravel(scoresList)
maxSamples, maxFeats, maxScore = bestParams(flattenedSamples, flattenedFeats, flattenedScores)
gridSamples, gridFeats = np.mgrid[saGridMin:saGridMax:saGridStep, fGridMin:fGridMax:fGridStep]
gridScores = griddata((flattenedSamples, flattenedFeats), flattenedScores, (gridSamples, gridFeats), method=interpMethod)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(maxSamples, maxFeats, "ro", ms=5, label="Max accuracy: {} achieved w/ {} samples and top {} features".format(round(maxScore, 4), maxSamples, maxFeats))
heatmap = ax.pcolor(gridSamples, gridFeats, gridScores)
cBar = fig.colorbar(heatmap)
cBar.ax.set_title("accuracy", fontsize=10)
cBar.set_ticks(np.linspace(0, 1, 11))
ax.set_title("Top k features of n sample dataset vs. classification accuracy", fontsize=12, y=1.10)
ax.set_xlabel("# samples")
ax.set_ylabel("# top features")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15))
plt.subplots_adjust(bottom=.3)
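As a sanity check of the grid construction described in the docstring above, the following sketch reproduces the 0-5000 x 0-1000 example with made-up points and prints the resulting shapes instead of plotting; the demo* names are placeholders, not part of the pipeline.
In [ ]:
# sketch of the interpolation grid from the docstring example, on assumed demo data
demoSamples = np.random.randint(0, 5000, 50)
demoFeats = np.random.randint(0, 1000, 50)
demoScores = np.random.rand(50)
demoGridSamples, demoGridFeats = np.mgrid[0:5000:25, 0:1000:10]
demoGridScores = griddata((demoSamples, demoFeats), demoScores,
                          (demoGridSamples, demoGridFeats), method="nearest")
print(demoGridSamples.shape, demoGridFeats.shape, demoGridScores.shape)  # (200, 100) each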
In [10]:
graphHeatmap(real_elem, real_feat, real_scores, interpMethod="linear", saGridMin=0, saGridMax=30000, saGridStep=300, fGridMin=0, fGridMax=100000, fGridStep=1000)
In [11]:
def graph3D(numSamplesList, numTopFeaturesList, scoresList, interpMethod="none",
saGridMin=None, saGridMax=None, saGridStep=None, fGridMin=None, fGridMax=None, fGridStep=None):
"""
plot a 3D visualization of classification results, and optionally an interpolated view where interpolation points
are determined by a bounded grid.
For example: saGridMin=0, saGridMax=5000, saGridStep=25, fGridMin=0, fGridMax=1000, fGridStep=10
Interpolation will occur at 200 * 100 = 20,000 points in a grid bounded by (0, 5000), (0, 1000) with samples step-size=25 and feature step-size=10
(0,0) , (0, 10) , (0, 20) , ... (0, 990)
(25,0) , (25, 10) , (25, 20) , ... (25, 990)
.
.
.
(4975,0), (4975,10), (4975,20), ... (4975, 990)
Parameters ------
numSamplesList: array-like
    array whose ith element, after flattening, is the number of samples used during the ith train/test evaluation
numTopFeaturesList: array-like
    array whose ith element, after flattening, is the number of top features used during the ith train/test evaluation
scoresList: array-like
    array whose ith element, after flattening, is the classifier score obtained using numSamplesList[i] samples
    and the top numTopFeaturesList[i] features
interpMethod: string (optional, default="none")
    one of "none", "nearest", "linear", or "cubic". If not "none", a 3D interpolation is performed with scipy.interpolate.griddata using the specified method
saGridMin: int
first point to interpolate samples at
saGridMax: int
last point to interpolate samples at
saGridStep: int
step-size for samples interpolation
fGridMin: int
first point to interpolate top features at
fGridMax: int
last point to interpolate top features at
fGridStep: int
step-size for top features interpolation
"""
flattenedSamples = np.ravel(numSamplesList)
flattenedFeats = np.ravel(numTopFeaturesList)
flattenedScores = np.ravel(scoresList)
maxSamples, maxFeats, maxScore = bestParams(flattenedSamples, flattenedFeats, flattenedScores)
fig = plt.figure()
fig.suptitle("Top k features of n sample dataset vs. classification accuracy", fontsize=14)
if interpMethod != "none":
gridSamples, gridFeats = np.mgrid[saGridMin:saGridMax:saGridStep, fGridMin:fGridMax:fGridStep]
gridScores = griddata((flattenedSamples, flattenedFeats), flattenedScores, (gridSamples, gridFeats), method=interpMethod)
fig.set_size_inches(10,4)
ax = fig.add_subplot(121, projection='3d')
ax.scatter(flattenedSamples, flattenedFeats, flattenedScores, s=10, alpha=.5)
ax.scatter(maxSamples, maxFeats, maxScore, c='blue', s=15)
ax.set_xlabel("# samples")
ax.set_ylabel("# top features")
ax.set_zlabel("accuracy")
ax.tick_params(axis='both', which='major', labelsize=8)
ax = fig.add_subplot(122, projection='3d')
ax.scatter(gridSamples, gridFeats, gridScores, s=.5, alpha=.1)
ax.scatter(maxSamples, maxFeats, maxScore, c='blue', s=15, label="Max accuracy: {} achieved w/ {} samples and top {} features".format(round(maxScore, 4), maxSamples, maxFeats))
ax.set_xlabel("# samples")
ax.set_ylabel("# top features")
ax.set_zlabel("accuracy")
ax.set_title("Interpolated", fontsize=10, y=1.07)
ax.tick_params(axis='both', which='major', labelsize=8)
ax.legend(loc='upper center', bbox_to_anchor=(-0.15, -0.15))
plt.subplots_adjust(bottom=.2)
else:
ax = fig.add_subplot(111, projection='3d')
ax.scatter(flattenedSamples, flattenedFeats, flattenedScores, s=10, alpha=.5)
ax.scatter(maxSamples, maxFeats, maxScore, c='blue', s=15, label="Max accuracy: {} achieved w/ {} samples and top {} features".format(round(maxScore, 4), maxSamples, maxFeats))
ax.set_xlabel("# samples")
ax.set_ylabel("# top features")
ax.set_zlabel("accuracy")
ax.tick_params(axis='both', which='major', labelsize=8)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1))
plt.subplots_adjust(bottom=.2)
In [12]:
graph3D(real_elem, real_feat, real_scores, interpMethod="linear", saGridMin=0, saGridMax=30000, saGridStep=300, fGridMin=0, fGridMax=100000, fGridStep=1000)
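For a quick look at the raw scores without interpolation, graph3D can also be called with interpMethod="none" on the synthetic data above; the grid bounds are not needed on that path.
In [ ]:
graph3D(real_elem, real_feat, real_scores, interpMethod="none")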
In [3]:
def bestParams(numSamplesList, numTopFeaturesList, scoresList):
"""
return a 3-tuple (samples, topFeatures, score) corresponding to the parameters that resulted in highest score
Parameters ------
numSamplesList: array-like
    array whose ith element, after flattening, is the number of samples used during the ith train/test evaluation
numTopFeaturesList: array-like
    array whose ith element, after flattening, is the number of top features used during the ith train/test evaluation
scoresList: array-like
    array whose ith element, after flattening, is the classifier score obtained using numSamplesList[i] samples
    and the top numTopFeaturesList[i] features
"""
flattenedSamples = np.ravel(numSamplesList)
flattenedFeat = np.ravel(numTopFeaturesList)
flattenedScores = np.ravel(scoresList)
assert(len(flattenedSamples) == len(flattenedFeat) and len(flattenedSamples) == len(flattenedScores)),\
    "Size mismatch: numSamplesList: length {}\tnumTopFeaturesList: length {}\tscoresList: length {}"\
    .format(len(flattenedSamples), len(flattenedFeat), len(flattenedScores))
argMax = np.argmax(flattenedScores)
return (flattenedSamples[argMax], flattenedFeat[argMax], flattenedScores[argMax])
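A quick check of bestParams on hand-made values (the numbers below are illustrative only): the third run has the highest score, so the expected result is (200, 50, 0.9).
In [ ]:
print(bestParams([100, 150, 200], [10, 25, 50], [0.7, 0.8, 0.9]))  # expect (200, 50, 0.9)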
In [23]:
def graph3DfromJSON(jsonPath, datasetSize, featuresSize, colorscale="Jet", scatter=False):
"""
3D visualization of classification results read from a JSON file
Parameters -------------
jsonPath: string
path to json file w/ classification results.
Expected format:
[
[# top features, % of dataset used, accuracy],
...
]
Ex: two runs, the first using the top 10000 features with a training set of 50% of the dataset (85% accuracy),
the second using the top 20000 features with a training set of 80% of the dataset (91% accuracy):
[
[10000, .5, .85],
[20000, .8, .91]
]
datasetSize: int
total size of dataset
featuresSize: int
total number of features
colorscale: string (optional, default="Jet")
    color scale for the 3D surface. See the plotly documentation for supported colormaps
scatter: bool (optional, default=False)
    if True, overlay the actual data points on the surface as a 3D scatter
"""
(feat, size, acc) = zip(*list_from_json(jsonPath))
# used for plotting actual data as scatter points
feat = [featuresSize if x == 0 else x for x in feat]
size = [datasetSize * x for x in size]
acc = np.array(acc)
best_samp, best_feat, best_acc = bestParams(size, feat, acc)
# used for plotting surface
feats = np.unique(feat)[::-1]
sizes = np.unique(size)
accs = np.reshape(acc, (len(feats), len(sizes)))
text = np.reshape(np.array(["feat: {} size: {} accuracy: {}".format(f, s, a) for (f, s, a) in zip(feat, size, acc)]), (len(feats), len(sizes)))
data = [
Surface(
x=sizes,
y=feats,
z=accs,
colorscale=colorscale,
hoverinfo="text",
text=text
),
Scatter3d(
x=[best_samp],
y=[best_feat],
z=[best_acc],
name="Best params",
marker = dict(
size = 10,
color = 'rgba(0, 255, 0, .8)',
),
hoverinfo="text",
text=["feat: {} size: {} accuracy: {}".format(best_feat, best_samp, best_acc)]
)
]
if scatter:
data.append(
Scatter3d(
x=size,
y=feat,
z=acc,
)
)
layout = Layout(
title='The relationship between total features, training size, and accuracy',
autosize=True,
width=850,
height=750,
scene=dict(
xaxis=dict(
title="training size",
type='log'),
yaxis=dict(
title="total features",
type='log'),
zaxis=dict(
title="accuracy",
range=[0, 1]
)
)
)
fig = Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
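For reference, a minimal sketch of the JSON layout graph3DfromJSON expects. The file name and values below are made up, the ordering (grouped by feature count, then by training-set fraction) is an assumption chosen so that the reshape inside the function lines up, and the standard json module is used here rather than json_io.
In [ ]:
import json
# hypothetical results: [# top features (0 = all features), fraction of dataset used, accuracy]
demo_results = [
    [20000, .5, .88],
    [20000, .8, .91],
    [10000, .5, .85],
    [10000, .8, .90],
]
with open("demo-results.json", "w") as f:
    json.dump(demo_results, f)
# graph3DfromJSON("demo-results.json", datasetSize=100000, featuresSize=50000)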
In [24]:
graph3DfromJSON("-twitter-reduction-trainsize-accuracy-mnbayes.json", twitterDatasetSize, twitterFeatureSize)
In [22]:
graph3DfromJSON("-reddit-reduction-trainsize-accuracy-mnbayes.json", redditDatasetSize, redditFeatureSize)