In [13]:
%pylab inline
import os
import seaborn 
import matplotlib.pyplot as plt
import natto
import IPython
import numpy
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import * 
from collections import namedtuple
# Set the default matplotlib figure size (width, height) used by subsequent plots.
pylab.rcParams['figure.figsize'] = (10.0, 8.0)


Populating the interactive namespace from numpy and matplotlib

In [14]:
# Load the text file as an RDD with one element per line.
preped_data = sc.textFile("/user/root/analytic_out/part-r-00000")

# Field names applied, in this order, to the split values of each line.
# NOTE(review): the first name covers what appears to be id and YEAR joined
# by a tab in the file -- confirm against the producer of this file.
# NOTE(review): the file's own header line ends with LABOR|ELEC|CAPITAL while
# this list ends with CAPITAL, LABOR, ELEC -- confirm which order is correct.
hdrdata = [
    'idtabbYEAR',
    'NETWORK',
    'LABOREXP',
    'STAFF',
    'ELECEXP',
    'KWH',
    'TOTCOST',
    'CAPITAL',
    'LABOR',
    'ELEC',
]

# Named-tuple record type with one attribute per entry of hdrdata.
# NOTE(review): the type name 'Flight' shows up in reprs; it looks like a
# leftover from another notebook -- confirm before renaming.
schema = namedtuple('Flight', hdrdata)

In [15]:
def parse(row):
    """Build a ``schema`` named tuple from one already-split record.

    Parameters
    ----------
    row : sequence
        Field values in the same order as the fields of ``schema``.
        Elements beyond the number of schema fields are ignored.

    Returns
    -------
    schema
        Named tuple populated positionally from the leading elements of ``row``.

    Raises
    ------
    TypeError
        If ``row`` has fewer elements than ``schema`` has fields.
    """
    # Slice to the schema's own width. The previous hard-coded 11 did not
    # match the 10-field schema, so any row with 11 or more elements raised
    # TypeError instead of being truncated.
    return schema(*row[:len(schema._fields)])

In [16]:
def mapperCAPITAL(value):
    """Return a one-element list holding ``value['CAPITAL']``.

    ``value`` is any mapping with a ``'CAPITAL'`` key; a missing key
    raises ``KeyError``.
    """
    capital = value['CAPITAL']
    return [capital]

In [17]:
def mapper(value):
    """Wrap ``value`` unchanged in a new single-element list."""
    return [value]

In [18]:
# Peek at the first raw line, then show how many lines have each
# "|"-separated field count (field count -> number of lines).
print preped_data.take(1) 
display( preped_data.map( lambda x: (len(x.split("|")),1) ).countByKey() )


# Reference layout and one sample row.
# NOTE(review): this lists 11 columns with id and YEAR separate, while the
# filter below keeps rows with 10 fields -- confirm it is still current.
# id, YEAR, NETWORK, LABOREXP, STAFF, ELECEXP, KWH, TOTCOST, LABOR,  ELEC,  CAPITAL
# 1,  90,  13107,     1061,    16,     163,     1631,  2536,  3709.715, 0,   41.83754
# Split every line on "|" and keep only rows with exactly 10 fields.
# NOTE(review): nothing here drops the file's header line, so it is carried
# along as a record -- confirm whether that is intended.
okdata = preped_data.map( lambda x: x.split("|")).filter( lambda x: len(x) == 10 )
# Number of kept rows next to the number of expected column names.
print okdata.count(), len(hdrdata)
# set it up as a dictionary
# (each row becomes {column name: value} by pairing hdrdata with the fields)
semidata = okdata.map( lambda x: dict( zip( hdrdata, x )))
display(semidata.take(1)[0])
# set it up as a named tuple
# (uses the second of the first three rows)
print schema(*okdata.take(3)[1])
print

# work with spark rdd's
# mapper wraps each dict in a one-element list; show the first four results.
result = semidata.map(mapper).take(4)
print result
print
print

# Pull the CAPITAL entry (as a one-element list) from the first eight rows.
result = semidata.map(mapperCAPITAL).take(8)

# Fifth of the eight results taken above.
ct = result[4]
print ct


[u'id\tYEAR|NETWORK|LABOREXP|STAFF|ELECEXP|KWH|TOTCOST|LABOR|ELEC|CAPITAL']
defaultdict(int, {10: 605})
605 10
{'CAPITAL': u'LABOR',
 'ELEC': u'CAPITAL',
 'ELECEXP': u'ELECEXP',
 'KWH': u'KWH',
 'LABOR': u'ELEC',
 'LABOREXP': u'LABOREXP',
 'NETWORK': u'NETWORK',
 'STAFF': u'STAFF',
 'TOTCOST': u'TOTCOST',
 'idtabbYEAR': u'id\tYEAR'}
Flight(idtabbYEAR=u'1\t92', NETWORK=u'7351', LABOREXP=u'1145', STAFF=u'17', ELECEXP=u'196', KWH=u'2000', TOTCOST=u'2846', CAPITAL=u'40.2319', LABOR=u'6.886859', ELEC=u'52.88124')

[[{'ELEC': u'CAPITAL', 'NETWORK': u'NETWORK', 'STAFF': u'STAFF', 'KWH': u'KWH', 'ELECEXP': u'ELECEXP', 'LABOREXP': u'LABOREXP', 'CAPITAL': u'LABOR', 'LABOR': u'ELEC', 'TOTCOST': u'TOTCOST', 'idtabbYEAR': u'id\tYEAR'}], [{'ELEC': u'52.88124', 'NETWORK': u'7351', 'STAFF': u'17', 'KWH': u'2000', 'ELECEXP': u'196', 'LABOREXP': u'1145', 'CAPITAL': u'40.2319', 'LABOR': u'6.886859', 'TOTCOST': u'2846', 'idtabbYEAR': u'1\t92'}], [{'ELEC': u'49.96203', 'NETWORK': u'13107', 'STAFF': u'17', 'KWH': u'1428', 'ELECEXP': u'170', 'LABOREXP': u'1148', 'CAPITAL': u'43.5839', 'LABOR': u'6.454062', 'TOTCOST': u'2634', 'idtabbYEAR': u'1\t91'}], [{'ELEC': u'51.73502', 'NETWORK': u'13107', 'STAFF': u'16', 'KWH': u'1631', 'ELECEXP': u'163', 'LABOREXP': u'1061', 'CAPITAL': u'41.83754', 'LABOR': u'6.427445', 'TOTCOST': u'2536', 'idtabbYEAR': u'1\t90'}]]


[u'35.11556']