In [ ]:

    
# Parenthesised call form: prints the same text on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("hello")



In [ ]:

    
x = 3



In [ ]:

    
x



In [ ]:

    
x = x + 1



In [ ]:

    
birdstrikes = sc.textFile("file:///home/bigdata/training/datasets/birdstrikes.csv")



In [ ]:

    
birdstrikes.count()



In [ ]:

    
birdstrikes.first()



In [ ]:

    
birdstrikes.take(2)

Partitioning



In [ ]:

    
birdstrikes.getNumPartitions()



In [ ]:

    
birdstrikes.repartition(10)



In [ ]:

    
birdstrikes.getNumPartitions()

Uh oh. (Immutability)



In [ ]:

    
rdd = birdstrikes.repartition(10)



In [ ]:

    
rdd.getNumPartitions()



In [ ]:

    
rdd.first()



In [ ]:

    
birdstrikes.first()

Lambda functions



In [ ]:

    
def is_not_header(row):
    """Tell whether `row` is a data line rather than the header line.

    A line is treated as the header when it begins with "id,".
    Returns False for the header and True for every other line.
    """
    if not row.startswith("id,"):
        return True
    return False



In [ ]:

    
rows = birdstrikes.filter(is_not_header)



In [ ]:

    
rows.take(2)



In [ ]:

    
def is_not_header(row):
    """Return True unless `row` begins with "id," (i.e. it is the header line)."""
    looks_like_header = row.startswith("id,")
    return not looks_like_header



In [ ]:

    
rows = birdstrikes.filter(is_not_header)



In [ ]:

    
rows.take(5)



In [ ]:

    
rows = birdstrikes.filter(lambda row: not row.startswith("id,"))



In [ ]:

    
rows.take(5)

Pipelines



In [ ]:

    
s = "abc"
"a" in s



In [ ]:

    
"d" in s



In [ ]:

    
"d" not in s



In [ ]:

    
rows.filter(lambda row: "Airplane" not in row).filter(lambda row: "Medium" not in row).take(2)



In [ ]:

    
# Exercise: count the number of rows where it is not United Airlines and not in Ohio



In [ ]:

    
rows.filter(lambda row: "UNITED AIRLINES" not in row).filter(lambda row: "Ohio" not in row).count()



In [ ]:

    
# Exercise: count the incidents that are not in Ohio and were caused by a Medium bird



In [ ]:

    
rows.filter(lambda row: "Ohio" not in row).filter(lambda row: "Medium" in row).count()



In [ ]:

    
# Exercise: count the incidents that are in neither Ohio nor California



In [ ]:

    
rows.filter(lambda row: "Ohio" not in row).filter(lambda row: "California" not in row).count()

Working with lists



In [ ]:

    
string = "1,2,3"



In [ ]:

    
string.split(",")



In [ ]:

    
l = string.split(",")



In [ ]:

    
l[0]



In [ ]:

    
l[1]



In [ ]:

    
lists = rows.map(lambda row: row.split(","))



In [ ]:

    
[ l[1], l[2], l[0] ]



In [ ]:

    
# Keep only fields 5 and 9 of every split row; these two fields are later
# labelled "state" and "cost" when the DataFrame is built.
states = lists.map(lambda item: [ item[5], item[9] ])



In [ ]:

    
# Exercise: take 5 of those, where the state is not Colorado



In [ ]:

    
states.filter(lambda row: row[0] != "Colorado").take(5)



In [ ]:

    
# Exercise: create a new variable cleanstates and put in those which are not empty



In [ ]:

    
cleanstates = states.filter(lambda row: row[0] != '').filter(lambda row: row[1] != '')



In [ ]:

    
# Convert the second field of each pair from a string to an int; the first
# field is passed through unchanged.
finalrdd = cleanstates.map(lambda l: [l[0], int(l[1])])



In [ ]:

    
finalrdd.take(5)



In [ ]:

    
#Sum cost
finalrdd.map(lambda l: l[1]).reduce(lambda x, y: x+y)



In [ ]:

    
# Build the DataFrame from the cleaned RDD (finalrdd) rather than the raw
# `states` RDD: pairs with empty values are already filtered out and cost is
# an int, so SQL aggregates such as SUM(cost) do not rely on implicit string
# casting and no blank-state group appears in the results.
df = finalrdd.toDF(["state","cost"])



In [ ]:

    
df.toPandas().head()



In [ ]:

    
df.registerTempTable("incidents")



In [ ]:

    
sqlContext.sql("SELECT * FROM incidents LIMIT 10").toPandas()



In [ ]:

    
sqlContext.sql("SELECT state, SUM(cost) as total_costs FROM incidents GROUP BY state LIMIT 10").toPandas()



In [ ]:

    
# Exercise: Select the average (AVG) cost of incidents by bird size



In [ ]: