In [3]:
import pandas as pd


In [3]:
# First exercise: counting the number of lines in each file (bookings and searches)

In [4]:
# Initialize a counter and read the file line by line, increasing the counter by 1 for each line read
cnt_booking=0
with open("C:/amadeus/bookings.csv") as infile:
    for line in infile:
        cnt_booking += 1
    print 'No of lines for Bookings: %d' % cnt_booking


No of lines for Bookings: 10000011

In [5]:
# Same as above: the number of lines is counted by looping through the rows of the searches file
cnt_searches=0
with open("C:/amadeus/searches.csv") as infile:
    for line in infile:
        cnt_searches += 1
    print 'No of lines for Searches: %d' % cnt_searches


No of lines for Searches: 20390199
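
In [ ]:
# A shorter sketch of the same count (assumption: counting raw lines, header row included),
# using a generator expression instead of an explicit counter.
def count_lines(path):
    with open(path) as infile:
        return sum(1 for _ in infile)

print('No of lines for Bookings: %d' % count_lines('C:/amadeus/bookings.csv'))
print('No of lines for Searches: %d' % count_lines('C:/amadeus/searches.csv'))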

In [6]:
# Second exercise: top 10 arrival airports in the world in 2013 (using the bookings file)

In [7]:
# Displaying top rows

In [8]:
# Since the file is very large, it is a good idea to read it in chunks and first understand the variables it contains
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',iterator=True,chunksize=50000)

In [9]:
# Display the first 5 rows of the first chunk of the bookings file
df.get_chunk(5)


Out[9]:
act_date source pos_ctry pos_iata pos_oid rloc cre_date duration distance dep_port ... route carrier bkg_class cab_class brd_time off_time pax year month oid
0 2013-03-05 00:00:00 1A DE a68dd7ae953c8acfb187a1af2dcbe123 1a11ae49fcbf545fd2afc1a24d88d2b7 ea65900e72d71f4626378e2ebd298267 2013-02-22 00:00:00 1708 0 ZRH ... LHRZRH VI T Y 2013-03-07 08:50:00 2013-03-07 11:33:37 -1 2013 3 NULL
1 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... SALATLCLT NV L Y 2013-04-12 13:04:00 2013-04-12 22:05:40 1 2013 3 NULL
2 2013-03-26 00:00:00 1A US e612b9eeeee6f17f42d9b0d3b79e75ca 7437560d8f276d6d05eeb806d9e7edee 737295a86982c941f1c2da9a46a14043 2013-03-26 00:00:00 135270 0 SAL ... CLTATLSAL NV U Y 2013-07-15 07:00:00 2013-07-15 11:34:51 1 2013 3 NULL
3 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... AKLHKGSVO XK G Y 2013-04-24 23:59:00 2013-04-25 16:06:31 1 2013 3 SYDA82546
4 2013-03-26 00:00:00 1A AU 0f984b3bb6bd06661c95529bbd6193bc 36472c6dbaf7afec9136ac40364e2794 5ecf00fdcbcec761c43dc7285253d0c1 2013-03-26 00:00:00 30885 0 AKL ... SVOHKGAKL XK G Y 2013-05-14 20:15:00 2013-05-16 10:44:50 1 2013 3 SYDA82546

5 rows × 38 columns
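
In [ ]:
# Sketch: read only a handful of rows with nrows to list all 38 column names
# (same path and separator as above; this does not consume the chunk iterator).
sample = pd.read_table('C:/amadeus/bookings.csv', sep='^', nrows=5)
print(list(sample.columns))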


In [10]:
# The file is loaded into memory as a pandas dataframe; only the columns 'arr_port' and 'pax' are read
df=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [11]:
# To find the total number of passengers per arrival airport, the dataframe is grouped by arr_port and pax is summed
# (cancellations included)
dfg=df.groupby('arr_port').sum()
print 'Top 10 arrival airports in the world in 2013'
dfg.sort(['pax'],ascending=[0]).head(10)


Top 10 arrival airports in the world in 2013
Out[11]:
pax
arr_port
LHR 88809
MCO 70930
LAX 70530
LAS 69630
JFK 66270
CDG 64490
BKK 59460
MIA 58150
SFO 58000
DXB 55590
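
In [ ]:
# A memory-friendly sketch of the same aggregation: sum pax per arr_port chunk by chunk and
# combine the partial results, instead of loading both columns for all ~10M rows at once.
reader = pd.read_table('C:/amadeus/bookings.csv', sep='^',
                       usecols=['arr_port', 'pax'], chunksize=500000)
partial = [chunk.groupby('arr_port').sum() for chunk in reader]
pax_totals = pd.concat(partial).groupby(level=0).sum()
pax_totals.sort(['pax'], ascending=[0]).head(10)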

In [12]:
# Finding the city and airport names corresponding to each arrival airport code

In [22]:
# The GeoBases module is imported into the IPython Notebook and the required database ('ori_por') is loaded
from GeoBases import GeoBase
geo_o = GeoBase(data='ori_por', verbose=False)
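
In [ ]:
# Quick check of the lookup call used below: get(<IATA code>, <field>) returns a single field for a code.
print(geo_o.get('LHR', 'name'))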

In [14]:
# The grouped dataframe is sorted, the index is reset so all headers appear on one line, and the top 10 airports are kept
df_geo=dfg.sort(['pax'],ascending=[0]).head(10).add_suffix('_total').reset_index()

In [15]:
# Since the arr_port values contain trailing white space, they need to be trimmed before they can be looked up

In [16]:
df_geo['arr_port']=df_geo.arr_port.str.strip()
df_geo['Airport_Name']=df_geo['arr_port']
df_geo['City']=df_geo['arr_port']

In [17]:
# Using GeoBases, city and airport names are looked up with the trimmed arr_port code from the bookings file as the key

In [18]:
i=0
while i < df_geo.arr_port.count():
    df_geo.loc[i, 'Airport_Name']=geo_o.get(df_geo['arr_port'][i], 'name')
    df_geo.loc[i, 'City']=geo_o.get(df_geo['arr_port'][i], 'city_code')
    i=i+1
df_geo=df_geo[['arr_port', 'pax_total', 'Airport_Name', 'City']]
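
In [ ]:
# A vectorized sketch of the same lookup using Series.apply instead of an index loop
# (assumption: every arr_port code in the top 10 is present in the GeoBases data).
df_geo['Airport_Name'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'name'))
df_geo['City'] = df_geo['arr_port'].apply(lambda code: geo_o.get(code, 'city_code'))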

In [19]:
print 'Top 10 arrival airports in the world in 2013 with Airport and City Names'
df_geo


Top 10 arrival airports in the world in 2013 with Airport and City Names
Out[19]:
arr_port pax_total Airport_Name City
0 LHR 88809 London Heathrow Airport LON
1 MCO 70930 Orlando International Airport ORL
2 LAX 70530 Los Angeles International Airport LAX
3 LAS 69630 Mc Carran International Airport LAS
4 JFK 66270 John F Kennedy International Airport NYC
5 CDG 64490 Paris - Charles-de-Gaulle PAR
6 BKK 59460 Suvarnabhumi BKK
7 MIA 58150 Miami International Airport MIA
8 SFO 58000 San Francisco International Airport SFO
9 DXB 55590 Dubai International Airport DXB

In [20]:
# Third exercise: plot the monthly number of searches for flights arriving at Málaga, Madrid or Barcelona

In [1]:
import matplotlib.pyplot as plt
import numpy as np
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Displaying the first two rows of the first chunk to understand the searches data
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',iterator=True,chunksize=50000)

In [4]:
df_search.get_chunk(2)


Out[4]:
Date Time TxnCode OfficeID Country Origin Destination RoundTrip NbSegments Seg1Departure ... Seg6Arrival Seg6Date Seg6Carrier Seg6BookingCode From IsPublishedForNeg IsFromInternet IsFromVista TerminalID InternetOffice
0 2013-01-01 20:25:57 MPT 624d8c3ac0b3a7ca03e3c167e0f48327 DE TXL AUH 1 2 TXL ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e FRA
1 2013-01-01 10:15:33 MPT b0af35b31588dc4ab06d5cf2986e8e02 MD ATH MIL 0 1 ATH ... NaN NaN NaN NaN 1ASIWS 0 0 0 d41d8cd98f00b204e9800998ecf8427e KIV

2 rows × 45 columns


In [23]:
# Using GeoBases, the IATA codes are looked up to confirm which airports they refer to
geo_o.get('AGP', 'name')


Out[23]:
'M\xc3\xa1laga Airport'

In [24]:
geo_o.get('MAD', 'name')


Out[24]:
'Madrid / Barajas'

In [25]:
geo_o.get('BCN', 'name')


Out[25]:
'Barcelona\xe2\x80\x93El Prat Airport'

In [4]:
# The dataframe is loaded into memory with only the Date and Destination columns
df_search=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Destination'])

In [5]:
# The dataframe is filtered down to Barcelona, Madrid and Malaga; .copy() avoids chained-assignment warnings below
df2=df_search[df_search['Destination'].isin(['BCN', 'MAD', 'AGP'])].copy()

In [6]:
# A counter column set to 1 is added so that summing it gives the total number of searches
df2['Searches']=1

In [7]:
# The month is extracted from the Date field
df2['month'] = pd.DatetimeIndex(df2['Date']).month

In [8]:
# The figure size is initialized for the plot
rcParams['figure.figsize'] = 12, 5

In [19]:
# The dataframe is grouped by destination and month to get the total number of searches
df3= df2.groupby(['Destination','month']).sum().add_suffix('_total').reset_index()

In [20]:
df3.head()


Out[20]:
Destination month Searches_total
0 AGP 1 9633
1 AGP 2 8379
2 AGP 3 10659
3 AGP 4 8265
4 AGP 5 10830

In [22]:
# Extracting dataframes for individual places
df_agp=df3[df3['Destination']=='AGP']
df_bcn=df3[df3['Destination']=='BCN']
df_mad=df3[df3['Destination']=='MAD']

In [30]:
# Plotting the monthly searches for the three cities using subplots
rcParams['figure.figsize'] = 15, 8
fig, axes = plt.subplots(nrows=2, ncols=2)

df_agp.groupby(['month']).sum().plot(ax=axes[0,0],title='Malaga',kind='bar',color='blue')
df_bcn.groupby(['month']).sum().plot(ax=axes[0,1],title='Barcelona',kind='bar',color='yellow')
df_mad.groupby(['month']).sum().plot(ax=axes[1,0],title='Madrid',kind='bar',color='grey')


Out[30]:
<matplotlib.axes.AxesSubplot at 0xc8109d0>
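
In [ ]:
# Alternative sketch: pivot the month/destination summary built above so all three destinations
# can be compared in a single plot.
pivoted = df3.pivot(index='month', columns='Destination', values='Searches_total')
pivoted.plot(title='Monthly searches for AGP, BCN and MAD')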

In [44]:
# Bonus exercise: write a Web Service

In [45]:
# Web service with Flask and JSON for the top 10 arrival airports in the world in 2013

In [46]:
import json

In [46]:
# Dataframe is loaded with the bookings data
df_air=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','pax'])

In [47]:
# A function is defined to return the top n airports in JSON format
def top_airports(n):
    dfg=df_air.groupby('arr_port').sum()
    dfg2=dfg.sort(['pax'],ascending=[0]).head(int(n)).add_suffix('_total').reset_index()
    df_json=dfg2.to_json(orient='index')
    return df_json
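
In [ ]:
# Quick sanity check of the helper before exposing it as a web service (sketch).
print(top_airports(3))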

In [48]:
# Flask is imported into the IPython Notebook to create a web service
from flask import Flask, url_for, Response
app = Flask(__name__)


@app.route('/api/v1/top_airports/<n>', methods = ['GET'])
def top_airport(n):
    df_json=top_airports(n)

    resp = Response(df_json, status=200, mimetype='application/json')

    return resp

if __name__ == '__main__':
    app.run()
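
In [ ]:
# Sketch of calling the service from Python instead of a browser (run from a separate session,
# since app.run() above blocks the notebook kernel; assumes Flask's default port 5000).
import urllib2
print(urllib2.urlopen('http://localhost:5000/api/v1/top_airports/10').read())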

In [49]:
# Type the URL below in a browser with the desired value of n. For example, for the top 20 airports:
# http://localhost:5000/api/v1/top_airports/20

# Below output is returned in the Browser

# {"0":{"arr_port":"LHR     ","pax_total":88809.0},"1":{"arr_port":"MCO     ","pax_total":70930.0},"2":{"arr_port":"LAX    
#","pax_total":70530.0},"3":{"arr_port":"LAS     ","pax_total":69630.0},"4":{"arr_port":"JFK     ","pax_total":66270.0},
#"5":{"arr_port":"CDG     ","pax_total":64490.0},"6":{"arr_port":"BKK     ","pax_total":59460.0},"7":{"arr_port":"MIA     
#","pax_total":58150.0},"8":{"arr_port":"SFO     ","pax_total":58000.0},"9":{"arr_port":"DXB     ","pax_total":55590.0},
#"10":{"arr_port":"ORD     ","pax_total":53200.0},"11":{"arr_port":"FCO     ","pax_total":45120.0},"12":{"arr_port":"IST    
#","pax_total":43850.0},"13":{"arr_port":"DFW     ","pax_total":42420.0},"14":{"arr_port":"CUN     ","pax_total":42000.0},"15"
#:{"arr_port":"MAD     ","pax_total":41410.0},"16":{"arr_port":"BCN     ","pax_total":41200.0},"17":{"arr_port":"LGA    
#","pax_total":40712.0},"18":{"arr_port":"ATL     ","pax_total":39780.0},"19":{"arr_port":"EWR     ","pax_total":36890.0}}

In [47]:
# Bonus exercise: match searches with bookings including cancellations

In [4]:
df_bookings=pd.read_table('C:/amadeus/bookings.csv',sep='^',usecols=['arr_port','dep_port'])

In [5]:
df_bookings['flag']=1

In [6]:
df_bookings['arr_port']=df_bookings.arr_port.str.strip(' ')

In [7]:
df_bookings['dep_port']=df_bookings.dep_port.str.strip(' ')

In [46]:
df_bookings.rename(columns={'arr_port':'Destination','dep_port':'Origin'}, inplace=True)
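
In [ ]:
# Optional sketch: keep one row per booked (Origin, Destination) pair so the left merge below does
# not multiply search rows when the same route appears in many bookings (assumption: only a yes/no
# conversion flag per route is needed). The merge could then use df_bookings_unique instead.
df_bookings_unique = df_bookings.drop_duplicates()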

In [46]:
df_searches=pd.read_table('C:/amadeus/searches.csv',sep='^',usecols=['Date','Origin','Destination'])

In [46]:
df_merge = pd.merge(df_searches, df_bookings, how='left', on=['Origin','Destination'])

In [46]:
df_merge=df_merge.fillna(0)

In [47]:
df_merge.to_csv('Converted_Searches.csv')

In [49]:
# In the output there is an extra column called flag: flag=1 means the search was converted to a booking, flag=0 means it was not
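
In [ ]:
# Sketch: a quick summary of how many searches matched a booked route (flag=1) versus not (flag=0).
print(df_merge['flag'].value_counts())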

In [50]:
#--------------------------------------End of Exercises including Bonus Exercises--------------------------------------------------------