In [223]:
%pylab inline
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import seaborn
import matplotlib.pyplot as plt
import IPython
import numpy
In [224]:
# Load the pipe-delimited analytic output, give it a typed schema and expose it
# to Spark SQL as the temporary table "data".
sqlContext = SQLContext(sc)
dataFile = sc.textFile("/user/root/analytic_out/part-r-00000")

# The first line of the file holds the '|'-separated column names.
header = dataFile.first()

# Schema: the first column stays a string, every remaining column is a float.
fields = [
    StructField(field_name, StringType() if position == 0 else FloatType(), True)
    for position, field_name in enumerate(header.split('|'))
]
schema = StructType(fields)
num_columns = len(fields)

# Separate header and data by exact comparison with the header line. A substring
# test such as `"id" in line` would also discard data rows that merely contain
# "id", and filter() avoids the shuffle that subtract() needs.
dataHeader = dataFile.filter(lambda l: l == header)
dataNoHeader = dataFile.filter(lambda l: l != header)


def _parse_row(line):
    """Split one '|'-delimited line into (str, float, float, ...) matching `schema`."""
    parts = line.split("|")
    return tuple([parts[0]] + [float(value) for value in parts[1:num_columns]])


data_temp = dataNoHeader.map(_parse_row)
data_df = sqlContext.createDataFrame(data_temp, schema)
data_df.printSchema()
data_df.registerTempTable("data")
In [225]:
# Row counts per (NETWORK, TOTCOST) pair, collected to the driver as a list of
# (NETWORK, TOTCOST, count) rows.
counts = sqlContext.sql( """select `NETWORK`, `TOTCOST`, count(*) from data group by `NETWORK`, `TOTCOST` """).collect()

# Distinct network values. `select distinct` returns rows in no guaranteed
# order, so sort them to make the positional selection below reproducible.
n_types = sorted(sqlContext.sql("select distinct `NETWORK` from data").rdd.map(lambda x:x[0]).collect())
print(n_types)

# Position, within the sorted list above, of the network to chart.
# NOTE(review): the original picked index 7 from an unsorted list, so this may
# select a different network than earlier runs did - adjust if needed.
NETWORK_INDEX = 7
nt = n_types[NETWORK_INDEX]
print(nt)

# Keep only the aggregates for the selected network: {TOTCOST: row count}.
totalcosts = {totalcost:cnt for n_type, totalcost, cnt in counts if n_type==nt}
labels = sorted(totalcosts.keys())
sizes = [totalcosts[key] for key in labels]

# One slice per distinct TOTCOST value, sized by its row count.
pcolors = seaborn.color_palette( "husl",n_colors = len(labels) )
plt.figure(figsize=(12,12))
plt.pie(sizes, labels=labels, colors=pcolors)
plt.title(nt)
# The printed list shows the distinct networks; the pie shows, for the selected
# network, how many rows fall on each TOTCOST value.
Out[225]:
In [228]:
# Take the first ten aggregated (NETWORK, TOTCOST, count) rows and draw a line
# plot of the leading field of each row.
plot_data = counts[:10]

# NOTE(review): x[0] is the NETWORK field of each row, not the count (x[2]) -
# confirm this is the series that was meant to be plotted.
first_fields = []
for x in plot_data:
    first_fields.append(x[0])
lplot = pylab.plot(first_fields)
In [ ]: