In [125]:
import pandas as pd
import math
from numpy import nan
import re
import plfit
from matplotlib.ticker import NullFormatter
from pylab import scatter
import pylab
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
from scipy import stats
# Apply one font family and size to every figure drawn in this notebook.
font = dict(family='Helvetica', size=16)
matplotlib.rc('font', **font)

# Load the benchmark results into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the CSV.
data = pd.read_csv(
    "/Users/jmccusker/src/rdflib/bioportal_benchmark-11_30-5.csv"
)
In [126]:
def _strip_apikey(message):
    """Remove the text matched up to and including the ``apikey=...`` value.

    Non-string entries are returned unchanged.
    """
    if not isinstance(message, str):
        return message
    return re.sub(".+?apikey=([0-9a-z]+[-]?)+", "", message)


data['error'] = data['error'].apply(_strip_apikey)

# A row is a failure exactly when its 'error' entry is a string; every other
# row is treated as a success.
_error_is_str = data['error'].apply(type) == str
failures = data[_error_is_str]
successes = data[~_error_is_str]
def get_log_bins(values, bin_count):
    """Return logarithmically spaced histogram bin edges covering ``values``.

    The edges are evenly spaced in log10 between ``min(values)`` and
    ``max(values)``.  ``bin_count + 1`` edges are returned so that a histogram
    built from them has exactly ``bin_count`` bins and its last bin ends at
    the maximum.  (Returning only the ``bin_count`` left edges silently drops
    every value above the last edge, including the maximum itself.)

    Parameters
    ----------
    values : iterable of numbers
        Must be non-empty and strictly positive, because ``math.log10`` is
        applied to the minimum and maximum.
    bin_count : int
        Number of bins; must be at least 1.

    Returns
    -------
    list
        ``bin_count + 1`` non-decreasing edges; the first is ``min(values)``
        and the last is ``max(values)``.

    Raises
    ------
    ValueError
        If ``bin_count`` is smaller than 1, ``values`` is empty, or the
        minimum of ``values`` is not positive.
    """
    if bin_count < 1:
        raise ValueError("bin_count must be at least 1")

    lowest = min(values)
    highest = max(values)
    log_low = math.log10(lowest)
    log_high = math.log10(highest)
    step = (log_high - log_low) / float(bin_count)

    # A real list (not a lazy map) so the result can be indexed and reused on
    # both Python 2 and Python 3.
    edges = [10 ** (log_low + step * i) for i in range(bin_count + 1)]

    # 10 ** log10(v) is not guaranteed to round-trip exactly; pin the outer
    # edges so the extreme values always fall inside the histogram range.
    edges[0] = lowest
    edges[-1] = highest
    return edges
def plot_power_law(x, y, xlabel, ylabel, color="blue", **kwargs):
    """Scatter ``y`` against ``x`` on log-log axes with a power-law fit.

    The fit is a least-squares regression of log10(y) on log10(x)
    (``scipy.stats.linregress``), i.e. the model ``y = a * x**b`` with
    ``a = 10**intercept`` and ``b = slope``.  Log-binned histograms of ``x``
    and ``y`` are drawn in narrow axes above and to the right of the scatter
    plot.  The figure is shown, and then the correlation coefficient, the
    p-value and the fitted coefficients are printed.

    Parameters
    ----------
    x, y : objects with an ``.apply`` method (the callers pass DataFrame columns)
        Every element must be strictly positive, since ``math.log10`` is
        applied to each one.
    xlabel, ylabel : str
        Axis labels for the scatter plot.
    color : optional
        Passed to ``scatter`` as ``c``.
    **kwargs
        Accepted for call compatibility; currently ignored.

    Returns
    -------
    None
    """
    fig = plt.figure(1, figsize=(16, 16), dpi=150)
    nullfmt = NullFormatter()  # suppresses tick labels on the marginal axes

    # Axes rectangles in figure coordinates: [left, bottom, width, height].
    # NOTE(review): left + width + 0.02 == 1.02, so both marginal histograms
    # start outside the unit figure area.  Presumably they are only visible
    # because the inline backend crops to a tight bounding box - confirm
    # before rendering with another backend.
    left, width = 0.1, 0.9
    bottom, height = 0.1, 0.9
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.1]
    rect_histy = [left_h, bottom, 0.1, height]

    ax = fig.add_axes(rect_scatter)
    ax.scatter(x, y, c=color, s=64, alpha=0.75)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Fit a straight line in log-log space.
    xx = x.apply(math.log10)
    yy = y.apply(math.log10)
    slope, intercept, r_value, p_value, slope_std_error = stats.linregress(xx, yy)
    a = 10 ** intercept
    b = slope
    predict_y = intercept + slope * xx

    # Draw on the scatter axes explicitly rather than on whichever axes
    # happens to be current.
    ax.plot(x, 10 ** predict_y, 'k-', label="Best fit")

    # Marginal histograms share the scatter plot's axis limits.
    axHistx = fig.add_axes(rect_histx)
    axHisty = fig.add_axes(rect_histy)

    axHistx.set_xscale('log')
    axHistx.set_xlim(ax.get_xlim())
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHistx.hist(list(x), get_log_bins(x, 30), facecolor='green', alpha=0.75)

    axHisty.set_yscale('log')
    axHisty.set_ylim(ax.get_ylim())
    axHisty.yaxis.set_major_formatter(nullfmt)
    axHisty.hist(list(y), get_log_bins(y, 30), facecolor='green', alpha=0.75,
                 orientation='horizontal')

    plt.show()

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("r = %s" % (r_value,))
    print("p = %s" % (p_value,))
    print("%s %s" % (a, b))
    print("y = %.6f * x^%-.6f" % (a, b))
In [127]:
# Keep only rows with a positive triple count, ordered by that count.
by_triples = data[data['triple_count'] > 0]
by_triples = by_triples.sort(columns=['triple_count'])


def _triple_point_color(row):
    """Pick the marker colour for one benchmark row."""
    if row['color_count'] == 0:
        return "red"
    if row['tree_depth'] > 0:
        return "yellow"
    return 'blue'


color = by_triples.apply(_triple_point_color, axis=1)
plot_power_law(
    by_triples['triple_count'],
    by_triples['to_hash_runtime'],
    xlabel="Size of graph (triples)",
    ylabel="Execution Time (s)",
    color=color
)
In [122]:
# Rows with a positive color_count, ordered by color_count.
has_bnodes = data[data['color_count'] > 0].sort(columns=["color_count"])

# Yellow where tree_depth is positive, blue everywhere else.
color = has_bnodes.apply(
    lambda row: "yellow" if row['tree_depth'] > 0 else 'blue',
    axis=1
)

plot_power_law(
    has_bnodes['color_count'],
    has_bnodes['to_hash_runtime'],
    xlabel="Final color count (nodes)",
    ylabel="Execution Time (s)",
    color=color
)
In [123]:
# Rows with a positive color_count, ordered by color_count.
has_bnodes = data[data['color_count'] > 0].sort(columns=["color_count"])

# Yellow where tree_depth is positive, blue everywhere else.
color = has_bnodes.apply(
    lambda row: "yellow" if row['tree_depth'] > 0 else 'blue',
    axis=1
)

plot_power_law(
    has_bnodes['adjacent_nodes'],
    has_bnodes['to_hash_runtime'],
    xlabel="BNode-adjacent nodes",
    ylabel="Execution Time (s)",
    color=color
)
In [128]:
# Rows with a positive color_count, ordered by color_count.
has_bnodes = data[data['color_count'] > 0].sort(columns=["color_count"])

# Yellow where tree_depth is positive, blue everywhere else.
color = has_bnodes.apply(
    lambda row: "yellow" if row['tree_depth'] > 0 else 'blue',
    axis=1
)

plot_power_law(
    has_bnodes['initial_color_count'],
    has_bnodes['initial_coloring_runtime'],
    xlabel="# Initial Colors",
    ylabel="Initial Coloring Time (s)",
    color=color
)
In [82]:
import matplotlib.mlab as mlab
# Log-binned histogram of the triple counts in by_triples.
x = list(by_triples['triple_count'])
fig = plt.figure()
ax = fig.add_subplot(111)

# Build bin_count + 1 edges, evenly spaced in log10 from min(x) to max(x).
# Using only the bin_count left edges would leave the top of the range
# outside the last bin, and hist() ignores values outside the given edges.
bin_count = 50
bins = [math.log10(min(x)), math.log10(max(x))]
print(bins)
log_low, log_high = bins
bins = [log_low + (log_high - log_low) * i / float(bin_count)
        for i in range(bin_count + 1)]
print(bins)
bins = [10 ** exponent for exponent in bins]
# 10 ** log10(v) may not round-trip exactly; pin the outer edges so the
# smallest and largest values are always counted.
bins[0], bins[-1] = min(x), max(x)
print(bins)

ax.set_xscale('log')

# hist returns the per-bin counts in n, the bin edges in bins and the drawn
# patches: bin_count counts, bin_count + 1 edges and bin_count patches.
n, bins, patches = ax.hist(x, bins, facecolor='green', alpha=0.75)

# Midpoint of each bin, for lining other artists up with the bars.
bincenters = 0.5 * (bins[1:] + bins[:-1])

ax.set_xlabel('Triples')
# NOTE(review): the histogram is not normalised, so this axis shows counts
# rather than probabilities - confirm the intended label.
ax.set_ylabel('Probability')
ax.grid(True)
plt.show()
In [32]:
# Scratch cell: display the 'initial_color_count' column of has_bnodes.
has_bnodes['initial_color_count']
Out[32]:
In [49]:
# Scratch cell: evaluates the bare name `range` so the notebook displays its
# repr; it has no effect on the analysis.
range
Out[49]:
In [83]:
# Scratch cell: number of rows in successes (rows whose 'error' entry is not a string).
len(successes)
Out[83]:
In [ ]: