In [3]:
import pandas as pd
In [3]:
# FIRST EXERCISE: Counting Number of Lines for Each File (Bookings and Searches)
In [4]:
# Initialize a counter and, using simple file handling, read every line and increment the counter by 1
cnt_booking = 0
with open("C:/amadeus/bookings.csv") as infile:
    for line in infile:
        cnt_booking += 1
print 'Number of lines in Bookings: %d' % cnt_booking
In [5]:
# Same approach as above: count the number of lines by looping through the rows of the file
cnt_searches = 0
with open("C:/amadeus/searches.csv") as infile:
    for line in infile:
        cnt_searches += 1
print 'Number of lines in Searches: %d' % cnt_searches
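In [ ]:
# Optional cross-check (added sketch, not part of the original run): the raw counts above include
# the header line, so the number of data records is one less, assuming each file starts with a
# single header row as the '^'-separated reads later in this notebook suggest.
print('Booking records (excluding header): %d' % (cnt_booking - 1))
print('Search records (excluding header): %d' % (cnt_searches - 1))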
In [6]:
# Second exercise: top 10 arrival airports in the world in 2013 (using the bookings file)
In [7]:
# Displaying top rows
In [8]:
# Since the file is very large, it is read in chunks first to inspect the variables it contains
df = pd.read_table('C:/amadeus/bookings.csv', sep='^', iterator=True, chunksize=50000)
In [9]:
# Display the first 5 rows of the first chunk from the bookings file
df.get_chunk(5)
Out[9]:
In [10]:
# Only the columns 'arr_port' and 'pax' are loaded into memory as a pandas DataFrame
df = pd.read_table('C:/amadeus/bookings.csv', sep='^', usecols=['arr_port', 'pax'])
In [11]:
# To get the total number of passengers per arrival airport, the DataFrame is grouped by
# arr_port and the pax values are summed (cancellations included)
dfg = df.groupby('arr_port').sum()
print 'Top 10 arrival airports in the world in 2013'
dfg.sort(['pax'], ascending=[0]).head(10)
Out[11]:
In [12]:
# Finding the city and airport names corresponding to each arrival airport code
In [22]:
# The GeoBases module is imported into the IPython Notebook and the required data source ('ori_por') is loaded
from GeoBases import GeoBase
geo_o = GeoBase(data='ori_por', verbose=False)
In [14]:
# The grouped DataFrame is sorted, the index is reset so all headers appear on one row, and only the top 10 airports are kept
df_geo = dfg.sort(['pax'], ascending=[0]).head(10).add_suffix('_total').reset_index()
In [15]:
# The airport codes in arr_port contain trailing whitespace, so they are trimmed before the GeoBases lookup
In [16]:
df_geo['cleansed'] = df_geo.arr_port.str.strip(' ')
df_geo.drop('arr_port', axis=1, inplace=True)
df_geo.rename(columns={'cleansed': 'arr_port'}, inplace=True)
# Placeholder columns, filled in with the GeoBases lookups below
df_geo['Airport_Name'] = df_geo['arr_port']
df_geo['City'] = df_geo['arr_port']
In [17]:
# Using GeoBases, the city and airport names are looked up with the cleaned arr_port code as the key
In [18]:
i = 0
while i < df_geo.arr_port.count():
    df_geo['Airport_Name'][i] = geo_o.get(df_geo['arr_port'][i], 'name')
    df_geo['City'][i] = geo_o.get(df_geo['arr_port'][i], 'city_code')
    i = i + 1
df_geo = df_geo[['arr_port', 'pax_total', 'Airport_Name', 'City']]
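In [ ]:
# Equivalent sketch using apply instead of the index-based while loop above (assumption: every
# arr_port code in the top 10 is present in the 'ori_por' data; a code missing from GeoBases
# would raise an error here just as it would in the loop).
df_geo['Airport_Name'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'name'))
df_geo['City'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'city_code'))
df_geo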
In [19]:
print 'Top 10 arrival airports in the world in 2013 with Airport and City Names'
df_geo
Out[19]:
In [20]:
# Third exercise: plot the monthly number of searches for flights arriving at Málaga, Madrid or Barcelona
In [1]:
import matplotlib.pyplot as plt
import pylab
import numpy as np
%pylab inline
In [3]:
# Read the searches file in chunks and display the first two rows to understand the data
df_search = pd.read_table('C:/amadeus/searches.csv', sep='^', iterator=True, chunksize=50000)
In [4]:
df_search.get_chunk(2)
Out[4]:
In [23]:
# Using GeoBases, the IATA codes are looked up to confirm which airports they refer to
geo_o.get('AGP', 'name')
Out[23]:
In [24]:
geo_o.get('MAD', 'name')
Out[24]:
In [25]:
geo_o.get('BCN', 'name')
Out[25]:
In [4]:
# Only the Date and Destination columns are loaded into memory
df_search = pd.read_table('C:/amadeus/searches.csv', sep='^', usecols=['Date', 'Destination'])
In [5]:
# The DataFrame is filtered to searches with destination Barcelona, Madrid or Málaga
df2 = df_search[df_search['Destination'].isin(['BCN', 'MAD', 'AGP'])]
In [6]:
# A counter column is initialized so that summing it gives the total number of searches
df2['Searches'] = 1
In [7]:
# Extract the month from the Date field
df2['month'] = pd.DatetimeIndex(df2['Date']).month
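In [ ]:
# Quick check of the month extraction on a tiny sample (assumption: the Date strings are in a
# format pandas can parse, e.g. '2013-01-15'); this is exactly what the line above does on df2
pd.DatetimeIndex(['2013-01-15', '2013-07-03']).month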
In [8]:
# Set the figure size for the plot
rcParams['figure.figsize'] = 12, 5
In [19]:
# The DataFrame is grouped by destination and month to get the total number of searches
df3 = df2.groupby(['Destination', 'month']).sum().add_suffix('_total').reset_index()
In [20]:
df3.head()
Out[20]:
In [22]:
# Extract a separate DataFrame for each destination
df_agp=df3[df3['Destination']=='AGP']
df_bcn=df3[df3['Destination']=='BCN']
df_mad=df3[df3['Destination']=='MAD']
In [30]:
# Plotting the monthly searches for the three destinations using subplots
rcParams['figure.figsize'] = 15, 8
fig, axes = plt.subplots(nrows=2, ncols=2)
df_agp.groupby(['month']).sum().plot(ax=axes[0,0], title='Malaga', kind='bar', color='blue')
df_bcn.groupby(['month']).sum().plot(ax=axes[0,1], title='Barcelona', kind='bar', color='yellow')
df_mad.groupby(['month']).sum().plot(ax=axes[1,0], title='Madrid', kind='bar', color='grey')
Out[30]:
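In [ ]:
# Compact alternative to the subplot version above (sketch): pivoting the grouped data puts the
# three destinations side by side, so a single plot call draws all monthly series at once.
df_pivot = df3.pivot(index='month', columns='Destination', values='Searches_total')
df_pivot.plot(kind='bar', title='Monthly searches for AGP, BCN and MAD')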
In [ ]:
In [44]:
# Bonus exercise: write a Web Service
In [45]:
# Web service with Flask returning the top N arrival airports in the world in 2013 as JSON
In [46]:
import json
In [46]:
# The DataFrame is loaded with the bookings data (arr_port and pax columns only)
df_air = pd.read_table('C:/amadeus/bookings.csv', sep='^', usecols=['arr_port', 'pax'])
In [47]:
# A function is defined to return the top n airports in JSON format
def top_airports(n):
    dfg = df_air.groupby('arr_port').sum()
    dfg2 = dfg.sort(['pax'], ascending=[0]).head(int(n)).add_suffix('_total').reset_index()
    df_json = dfg2.to_json(orient='index')
    return df_json
In [48]:
# Flask is imported into the notebook to expose the top_airports function as a web service
from flask import Flask, Response
app = Flask(__name__)
@app.route('/api/v1/top_airports/<n>', methods=['GET'])
def top_airport(n):
    df_json = top_airports(n)
    resp = Response(df_json, status=200, mimetype='application/json')
    return resp
if __name__ == '__main__':
    app.run()
In [49]:
# Type the URL below in a browser, replacing N with the number of airports wanted;
# for example, for the top 20 airports:
# http://localhost:5000/api/v1/top_airports/20
# The following output is returned in the browser:
# {"0":{"arr_port":"LHR ","pax_total":88809.0},"1":{"arr_port":"MCO ","pax_total":70930.0},"2":{"arr_port":"LAX
#","pax_total":70530.0},"3":{"arr_port":"LAS ","pax_total":69630.0},"4":{"arr_port":"JFK ","pax_total":66270.0},
#"5":{"arr_port":"CDG ","pax_total":64490.0},"6":{"arr_port":"BKK ","pax_total":59460.0},"7":{"arr_port":"MIA
#","pax_total":58150.0},"8":{"arr_port":"SFO ","pax_total":58000.0},"9":{"arr_port":"DXB ","pax_total":55590.0},
#"10":{"arr_port":"ORD ","pax_total":53200.0},"11":{"arr_port":"FCO ","pax_total":45120.0},"12":{"arr_port":"IST
#","pax_total":43850.0},"13":{"arr_port":"DFW ","pax_total":42420.0},"14":{"arr_port":"CUN ","pax_total":42000.0},"15"
#:{"arr_port":"MAD ","pax_total":41410.0},"16":{"arr_port":"BCN ","pax_total":41200.0},"17":{"arr_port":"LGA
#","pax_total":40712.0},"18":{"arr_port":"ATL ","pax_total":39780.0},"19":{"arr_port":"EWR ","pax_total":36890.0}}
In [47]:
# Bonus exercise: match searches with bookings including cancellations
In [4]:
df_bookings=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','dep_port'])
In [5]:
# Every booked origin/destination pair is flagged with 1
df_bookings['flag']=1
In [6]:
df_bookings['arr_port']=df_bookings.arr_port.str.strip(' ')
In [7]:
df_bookings['dep_port']=df_bookings.dep_port.str.strip(' ')
In [46]:
# Rename the bookings columns to match the column names used in the searches file
df_bookings.rename(columns={'arr_port':'Destination','dep_port':'Origin'}, inplace=True)
In [46]:
df_searches=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Origin','Destination'])
In [46]:
# Left-join the searches with the bookings on the Origin/Destination pair
df_merge = pd.merge(df_searches, df_bookings, how='left', on=['Origin','Destination'])
In [46]:
# Searches with no matching booking get flag 0
df_merge=df_merge.fillna(0)
In [47]:
df_merge.to_csv('Converted_Searches.csv')
In [49]:
# The output contains an extra column called flag: flag=1 means the search was converted to a booking, flag=0 means it was not
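In [ ]:
# Quick summary of the flag column in the merged output (sketch): the fraction of rows in
# Converted_Searches.csv that matched at least one booking. Note that a search can match
# several booking rows with the same Origin/Destination pair, so this counts merged rows
# rather than distinct searches.
converted = df_merge['flag'].sum()
print('Rows flagged as converted: %d of %d (%.1f%%)' % (converted, len(df_merge), 100.0 * converted / len(df_merge)))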
In [50]:
#--------------------------------------End of Exercises including Bonus Exercises--------------------------------------------------------