In [ ]:
# Call form of print: valid on both Python 2 and Python 3.
print("hello")
In [ ]:
x = 3
In [ ]:
x
In [ ]:
x = x + 1
In [ ]:
# Load the CSV as an RDD with one string element per line of the file.
# NOTE(review): `sc` is not defined in this notebook; presumably the
# SparkContext supplied by the kernel -- confirm.
birdstrikes = sc.textFile("file:///home/bigdata/training/datasets/birdstrikes.csv")
In [ ]:
birdstrikes.count()
In [ ]:
birdstrikes.first()
In [ ]:
birdstrikes.take(2)
Partitioning
In [ ]:
birdstrikes.getNumPartitions()
In [ ]:
# The repartitioned RDD returned here is not assigned to anything, so
# `birdstrikes` keeps its original partitioning (RDDs are immutable).
birdstrikes.repartition(10)
In [ ]:
birdstrikes.getNumPartitions()
Uh oh. (Immutability)
In [ ]:
rdd = birdstrikes.repartition(10)
In [ ]:
rdd.getNumPartitions()
In [ ]:
rdd.first()
In [ ]:
birdstrikes.first()
lambda functions
In [ ]:
def is_not_header(row):
    """Return True for data rows and False for the CSV header line.

    The header is recognised by its leading ``"id,"`` column name.

    Args:
        row: One line of the CSV file, as a string.

    Returns:
        False if ``row`` starts with ``"id,"``, True otherwise.
    """
    # Deliberately spelled out as if/else; a later cell shows the short form.
    if row.startswith("id,"):
        return False
    else:
        return True
In [ ]:
rows = birdstrikes.filter(is_not_header)
In [ ]:
rows.take(2)
In [ ]:
def is_not_header(row):
    """Return True for data rows and False for the CSV header line.

    Args:
        row: One line of the CSV file, as a string.

    Returns:
        False if ``row`` starts with ``"id,"`` (the header), True otherwise.
    """
    return not row.startswith("id,")
In [ ]:
rows = birdstrikes.filter(is_not_header)
In [ ]:
rows.take(5)
In [ ]:
# Same header filter as is_not_header, written inline as a lambda.
rows = birdstrikes.filter(lambda row: not row.startswith("id,"))
In [ ]:
rows.take(5)
Pipelines
In [ ]:
s = "abc"
"a" in s
In [ ]:
"d" in s
In [ ]:
"d" not in s
In [ ]:
# Chain two filters: drop lines mentioning "Airplane", then lines mentioning
# "Medium", and fetch the first two that remain.
(
    rows
    .filter(lambda line: "Airplane" not in line)
    .filter(lambda line: "Medium" not in line)
    .take(2)
)
In [ ]:
# Exercise: the number of rows where it is not United Airlines and not in Ohio
In [ ]:
# Count lines that mention neither "UNITED AIRLINES" nor "Ohio".
# The substring test looks at the whole line, not at a specific column.
(
    rows
    .filter(lambda line: "UNITED AIRLINES" not in line)
    .filter(lambda line: "Ohio" not in line)
    .count()
)
In [ ]:
# Exercise: the number of incidents not in Ohio and caused by a Medium bird
In [ ]:
# Count lines that do not mention "Ohio" but do mention "Medium".
# The substring test looks at the whole line, not at a specific column.
(
    rows
    .filter(lambda line: "Ohio" not in line)
    .filter(lambda line: "Medium" in line)
    .count()
)
In [ ]:
# Exercise: the number of incidents in neither Ohio nor California
In [ ]:
# Count lines that mention neither "Ohio" nor "California".
# The substring test looks at the whole line, not at a specific column.
(
    rows
    .filter(lambda line: "Ohio" not in line)
    .filter(lambda line: "California" not in line)
    .count()
)
Working with lists
In [ ]:
string = "1,2,3"
In [ ]:
string.split(",")
In [ ]:
l = string.split(",")
In [ ]:
l[0]
In [ ]:
l[1]
In [ ]:
# Turn each text line into a list of field strings.
# This splits on every comma, including any inside a quoted field.
lists = rows.map(lambda row: row.split(","))
In [ ]:
[ l[1], l[2], l[0] ]
In [ ]:
# Keep only fields 5 and 9 of every row; later cells label these two
# columns "state" and "cost".
states = lists.map(lambda item: [ item[5], item[9] ])
In [ ]:
# Exercise: take 5 of those where the state is not Colorado
In [ ]:
# First five [state, cost] pairs whose state field is not "Colorado".
(
    states
    .filter(lambda pair: pair[0] != "Colorado")
    .take(5)
)
In [ ]:
# Exercise: create a new variable cleanstates and put in those which are not empty
In [ ]:
# Keep only pairs where both the first (state) and second (cost) field
# are non-empty strings.
cleanstates = (
    states
    .filter(lambda pair: pair[0] != '')
    .filter(lambda pair: pair[1] != '')
)
In [ ]:
# Convert the second field from text to an integer; the first is kept as-is.
finalrdd = cleanstates.map(lambda pair: [pair[0], int(pair[1])])
In [ ]:
finalrdd.take(5)
In [ ]:
# Sum cost: project out the integer second field, then add pairwise.
(
    finalrdd
    .map(lambda pair: pair[1])
    .reduce(lambda total, cost: total + cost)
)
In [ ]:
# Build a two-column DataFrame named "state" / "cost".
# NOTE(review): this uses `states` (unfiltered, cost still a string), not the
# cleaned `finalrdd` built above -- confirm this is intended.
df = states.toDF(["state","cost"])
In [ ]:
df.toPandas().head()
In [ ]:
# Expose the DataFrame to SQL queries under the table name "incidents".
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView -- switch if this runs on Spark 2+.
df.registerTempTable("incidents")
In [ ]:
sqlContext.sql("SELECT * FROM incidents LIMIT 10").toPandas()
In [ ]:
# Total cost per state. There is no ORDER BY, so which 10 groups the LIMIT
# keeps is not defined.
sqlContext.sql("SELECT state, SUM(cost) as total_costs FROM incidents GROUP BY state LIMIT 10").toPandas()
In [ ]:
# Exercise: Select the average (AVG) cost of incidents by bird size
In [ ]: