In [ ]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.sql.types import *
import pandas as pd
import StringIO
import matplotlib.pyplot as plt
hc = sc._jsc.hadoopConfiguration()
hc.set("hive.execution.engine", "mr")

working_storage = 'WORKING_STORAGE'
output_directory = 'jupyter/py2'
protocol_name = 'PROTOCOL_NAME://'

Load Carriers data


In [ ]:
carriers = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/carriers").cache()   
sqlContext.registerDataFrameAsTable(carriers, "carriers")
carriers.printSchema()
carriers.limit(20).toPandas()

Load Airports data


In [ ]:
airports = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/airports").cache()
sqlContext.registerDataFrameAsTable(airports, "airports")
airports.printSchema()
airports.limit(20).toPandas()

Load Flights data


In [ ]:
flights = sqlContext.read.parquet(protocol_name + working_storage + "/" + output_directory + "/flights").cache()
flights.printSchema()
sqlContext.registerDataFrameAsTable(flights, "flights")
flights.limit(10).toPandas()[["ArrDelay","CarrierDelay","CarrierDelayStr","WeatherDelay","WeatherDelayStr","Distance"]]

Taxonomy for ArrDelay, CarrierDelay, and Distance colums


In [ ]:
flights.describe("ArrDelay","CarrierDelay","Distance").toPandas()

Let's find the top 10 of the most unpunctual airlines


In [ ]:
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')


delay = sqlContext.sql("select SUBSTR(c.description, 0, 15) as Carrier, WorkDayDelay, WeekendDelay from \
   (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WorkDayDelay, \
   f.UniqueCarrier \
   FROM flights f \
         WHERE f.DayOfWeek < 6 \
         GROUP BY f.UniqueCarrier ORDER BY WorkDayDelay desc limit 10) t \
JOIN \
  (select ceil( avg(f.ArrDelay + f.DepDelay) ) as WeekendDelay, \
   f.UniqueCarrier \
   FROM flights f \
         WHERE f.DayOfWeek > 5 \
         GROUP BY f.UniqueCarrier) t1 \
ON t.UniqueCarrier = t1.UniqueCarrier \
JOIN carriers c on t.UniqueCarrier = c.code order by WeekendDelay desc, WorkDayDelay desc \
").toPandas()

color_range_days = ["#2966FF", "#61F2FF"]
delay["Average"] = (delay.WorkDayDelay + delay.WeekendDelay) / 2
ax = delay.Average.plot(x='Carrier', linestyle='-', marker='o')
delay.plot(x='Carrier', y=['WorkDayDelay','WeekendDelay'], kind='bar', legend = True,  figsize=(12, 4), color=color_range_days, ax=ax);

Number of flight performed by top companies


In [ ]:
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

q = """SELECT t.cnt as FlightsAmt, carriers.description as Carrier FROM (
            SELECT count(*) as cnt, flights.UniqueCarrier as carrier_code 
                FROM flights GROUP BY flights.UniqueCarrier LIMIT 6) t 
            LEFT JOIN carriers ON t.carrier_code = carriers.code"""

topFlights = sqlContext.sql(q).toPandas()
topFlights.plot.pie(labels=topFlights["Carrier"], autopct='%.2f', legend=False, y="FlightsAmt", figsize=(12,12));

The average Flight Distance per Company


In [ ]:
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

resultDistance = sqlContext.sql("SELECT SUBSTR(c.description, 0, 15) as Carrier, COUNT(Distance) AS Distance FROM flights f JOIN carriers c ON f.UniqueCarrier = c.code GROUP BY c.description ORDER BY distance DESC LIMIT 10").toPandas()

color_range =  ["#2966FF",
 	  "#2E73FF",
 	  "#3380FF",
 	  "#388CFF",
 	  "#3D99FF",
 	  "#42A6FF",
 	  "#47B2FF",
 	  "#4CBFFF",
 	  "#52CCFF",
 	  "#57D9FF",
 	  "#5CE6FF",
 	  "#61F2FF",
      "#66FFFF"]

resultDistance.plot(x='Carrier',  y='Distance', kind='bar', color=color_range, legend = False);

In [ ]: