In [ ]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.sql.types import *
import pandas as pd
import StringIO
import matplotlib.pyplot as plt
hc = sc._jsc.hadoopConfiguration()
hc.set("hive.execution.engine", "mr")
working_storage = 'WORKING_STORAGE'
output_directory = 'jupyter/py2'
protocol_name = 'PROTOCOL_NAME://'
In [ ]:
carriers = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/carriers").cache()
sqlContext.registerDataFrameAsTable(carriers, "carriers")
carriers.printSchema()
carriers.limit(20).toPandas()
In [ ]:
airports = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/airports").cache()
sqlContext.registerDataFrameAsTable(airports, "airports")
airports.printSchema()
airports.limit(20).toPandas()
In [ ]:
flights = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/flights").cache()
flights.printSchema()
sqlContext.registerDataFrameAsTable(flights, "flights")
flights.limit(10).toPandas()[["ArrDelay","CarrierDelay","CarrierDelayStr","WeatherDelay","WeatherDelayStr","Distance"]]
In [ ]:
flights.describe("ArrDelay","CarrierDelay","Distance").toPandas()
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
delay = sqlContext.sql("select SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay from \
(select ceil( avg(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, \
f.UniqueCarrier \
FROM flights f \
WHERE f.DayOfWeek < 6 \
GROUP BY f.UniqueCarrier ORDER BY WorkDayDelay desc limit 10) t \
JOIN \
(select ceil( avg(f.ArrDelay + f.DepDelay) ) as WeekendDelay, \
f.UniqueCarrier \
FROM flights f \
WHERE f.DayOfWeek > 5 \
GROUP BY f.UniqueCarrier) t1 \
ON t.UniqueCarrier = t1.UniqueCarrier \
JOIN carriers c on t.UniqueCarrier = c.code order by WeekendDelay desc, WorkDayDelay desc \
").toPandas()
color_range_days = ["#2966FF", "#61F2FF"]
delay["Average"] = (delay.WorkDayDelay + delay.WeekendDelay) / 2
ax = delay.Average.plot(x='Carrier', linestyle='-', marker='o')
delay.plot(x='Carrier', y=['WorkDayDelay','WeekendDelay'], kind='bar', legend = True, figsize=(12, 4), color=color_range_days, ax=ax);
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
q = """SELECT t.cnt as FlightsAmt, carriers.description as Carrier FROM (
SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code
FROM flights GROUP BY flights.UniqueCarrier LIMIT 6) t
LEFT JOIN carriers ON t.carrier_code = carriers.code"""
topFlights = sqlContext.sql(q).toPandas()
topFlights.plot.pie(labels=topFlights["Carrier"], autopct='%.2f', legend=False, y="FlightsAmt", figsize=(12,12));
In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
resultDistance = sqlContext.sql("SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance FROM flights f JOIN carriers c ON f.UniqueCarrier = c.code GROUP BY c.description ORDER BY distance DESC LIMIT 10").toPandas()
color_range = ["#2966FF",
"#2E73FF",
"#3380FF",
"#388CFF",
"#3D99FF",
"#42A6FF",
"#47B2FF",
"#4CBFFF",
"#52CCFF",
"#57D9FF",
"#5CE6FF",
"#61F2FF",
"#66FFFF"]
resultDistance.plot(x='Carrier', y='Distance', kind='bar', color=color_range, legend = False);
In [ ]: