Imports and backend setup


In [1]:
from lib import MongoBackend
from matplotlib import pyplot as plt
from matplotlib.pyplot import show
# Shared backend handle: every query/plot cell in this notebook goes through it.
# NOTE(review): 'master' is passed straight to MongoBackend; what it selects
# is not visible from this notebook -- confirm against lib.MongoBackend.
backend = MongoBackend('master')

In [ ]:
# Sanity check: fetch the Mahout K-means text results for minDF=110, k=10 and
# report how many entries came back.
# NOTE(review): the second dict presumably selects the returned fields -- confirm
# against MongoBackend.query.
res = backend.query(
    "mahout_kmeans_text",
    ({'minDF': 110, 'k': 10}, {'input_size': 1, 'output_size': 1}),
)
# Function-call form of print: same output on Python 2, and valid on Python 3
# (the bare `print len(res)` statement is a SyntaxError there).
print(len(res))

Simple Example: input vs output size


In [4]:
# Input size vs output size for the Mahout K-means text results recorded with
# minDF=110 and k=10.
query = (
    {'minDF': 110, 'k': 10},
    {'input_size': 1, 'output_size': 1},
)
backend.plot_query(
    "mahout_kmeans_text",
    query,
    title="Input vs Output size",
    ylabel='size(bytes)',
    xlabel='size(bytes)',
)

Mahout K-means, multi-K


In [5]:
# Mahout K-means on text, minDF fixed at 110: one plot_query call per
# k in 5, 10, 15, 20, followed by a single show().
plt.title("Mahout Kmeans text")
plt.ylabel('time(sec)')
plt.xlabel('#documents')

# 'k' starts as a -1 placeholder and is overwritten before every plot_query call.
query = ({'minDF': 110, 'k': -1}, {'documents': 1, 'time': 1})
for k in range(5, 21, 5):
    query[0]['k'] = k
    backend.plot_query(
        "mahout_kmeans_text",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
show()

In [19]:


In [11]:
# Mahout K-means on text, minDF fixed at 110, requesting 'input_size' and 'time':
# one plot_query call per k in 5, 10, 15, 20.
# The initial 'k': 10 is replaced on the first loop iteration.
query = ({'minDF': 110, 'k': 10}, {'input_size': 1, 'time': 1})

plt.title("Mahout Kmeans text")
plt.ylabel('time(sec)')
plt.xlabel('size (bytes)')

for k in range(5, 21, 5):
    query[0]['k'] = k
    backend.plot_query(
        "mahout_kmeans_text",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
plt.show()

In [ ]:
# Weka K-means: one plot_query call per k in 2, 5, 10, 20 against the
# "weka_kmeans_comp" collection, requesting 'documents' and 'time'.
# 'k' starts as a -1 placeholder and is set inside the loop.
query = ({'k': -1}, {'documents': 1, 'time': 1})

plt.title("Weka Kmeans text")
plt.ylabel('time(sec)')
plt.xlabel('# documents')

for k in [2, 5, 10, 20]:
    query[0]['k'] = k
    backend.plot_query(
        "weka_kmeans_comp",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
plt.show()

In [ ]:
# Cilk K-means: one plot_query call per k in 2, 5, 10, 20 against the
# "cilk_kmeans_comp" collection, requesting 'documents' and 'time'.
# 'k' starts as a -1 placeholder and is set inside the loop.
query = ({'k': -1}, {'documents': 1, 'time': 1})

plt.title("Cilk Kmeans text")
plt.ylabel('time(sec)')
plt.xlabel('# documents')

for k in [2, 5, 10, 20]:
    query[0]['k'] = k
    backend.plot_query(
        "cilk_kmeans_comp",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
plt.show()

In [ ]:
### Spark K-means, multi-K

In [20]:
# Spark K-means on text, minDF fixed at 10: one plot_query call per
# k in 5, 10, 15, 20, followed by a single show().
plt.title("Spark Kmeans text")
plt.ylabel('time(sec)')
plt.xlabel('#documents')

# 'k' starts as a -1 placeholder and is overwritten before every plot_query call.
query = ({'minDF': 10, 'k': -1}, {'documents': 1, 'time': 1})
for k in range(5, 21, 5):
    query[0]['k'] = k
    backend.plot_query(
        "spark_kmeans_text",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
show()

In [21]:
### Cilk

In [ ]:
# Cilk K-means with minDF fixed: one plot_query call per k in 5, 10, 15, 20,
# followed by a single show().
# 'k' starts as a -1 placeholder and is overwritten before every plot_query call.
query = ({'minDF': 160, 'k': -1}, {'documents': 1, 'time': 1})

# Build the title from the minDF value the query actually filters on, so the
# caption cannot drift from the plotted data.
# NOTE(review): the previous hard-coded title read "mindDF:110" while the query
# used minDF=160 -- confirm 160 is the intended value.
plt.title("Cilk K-Means (minDF:%d)" % query[0]['minDF'])
plt.ylabel('time(sec)')
plt.xlabel('#documents')

for k in range(5, 21, 5):
    query[0]['k'] = k
    backend.plot_query(
        "cilk_kmeans",
        query,
        label='k=%d' % k,
        show_plot=False,
    )
show()

In [2]:
# Cilk K-means with k fixed at 10: one plot_query call per
# minDF in 10, 60, 110, 160, followed by a single show().
plt.title("Cilk K-Means (k:10)")
plt.ylabel('time(sec)')
plt.xlabel('#documents')

# 'minDF' starts at 1 only as a placeholder; the loop sets the real value.
query = ({'minDF': 1, 'k': 10}, {'documents': 1, 'time': 1})
for minDF in range(10, 161, 50):
    query[0]['minDF'] = minDF
    backend.plot_query(
        "cilk_kmeans",
        query,
        label='minDF=%d' % minDF,
        show_plot=False,
    )
show()

Streaming K-Means


In [2]:
# Spark streaming K-means: one plot_query call per k in 2, 5, 10, 15, 20
# against "streaming_kmeans", requesting 'lines' and 'time'.
plt.title("Spark Stream. K-Means")
plt.ylabel('time(sec)')
plt.xlabel('#entries')

for k in (2, 5, 10, 15, 20):
    # A fresh query tuple is built for each k rather than mutating a shared one.
    query = ({'k': k}, {'lines': 1, 'time': 1})
    backend.plot_query(
        "streaming_kmeans",
        query,
        label='K=%d' % k,
        show_plot=False,
    )
show()

In [ ]:
# Spark streaming K-means: one plot_query call per k in 2, 5, 10, 15, 20
# against "streaming_kmeans", requesting 'lines' and 'time'.
# NOTE(review): this cell repeats the streaming K-means cell directly before it;
# consider dropping one of the two.
plt.title("Spark Stream. K-Means")  # `plt`, not `lt`: the typo raised NameError
plt.ylabel('time(sec)')
plt.xlabel('#entries')

for k in (2, 5, 10, 15, 20):
    # A fresh query tuple is built for each k rather than mutating a shared one.
    query = ({'k': k}, {'lines': 1, 'time': 1})
    backend.plot_query(
        "streaming_kmeans",
        query,
        label='K=%d' % k,
        show_plot=False,
    )
show()